Add page table management functions to be used for KVM guest (gmap) page
tables. This patch adds functions to walk to specific table entries, or to
perform actions on a range of entries.

Signed-off-by: Claudio Imbrenda
---
 arch/s390/kvm/dat.c | 386 ++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/dat.h |  39 +++++
 2 files changed, 425 insertions(+)

diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index a9d5b49ac411..3158652dbe8e 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -219,3 +219,389 @@ union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, g
 	WRITE_ONCE(*ptep, new);
 	return pgste;
 }
+
+/*
+ * dat_split_ste - Split a segment table entry into page table entries
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: 0 in case of success, -ENOMEM if running out of memory.
+ */
+static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn,
+			 union asce asce, bool uses_skeys)
+{
+	union pgste pgste_init;
+	struct page_table *pt;
+	union pmd new, old;
+	union pte init;
+	int i;
+
+	BUG_ON(!mc);
+	old = READ_ONCE(*pmdp);
+
+	/* Already split, nothing to do */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	pt = dat_alloc_pt_noinit(mc);
+	if (!pt)
+		return -ENOMEM;
+	new.val = virt_to_phys(pt);
+
+	while (old.h.i || old.h.fc) {
+		init.val = pmd_origin_large(old);
+		init.h.p = old.h.p;
+		init.h.i = old.h.i;
+		init.s.d = old.s.fc1.d;
+		init.s.w = old.s.fc1.w;
+		init.s.y = old.s.fc1.y;
+		init.s.sd = old.s.fc1.sd;
+		init.s.pr = old.s.fc1.pr;
+		pgste_init.val = 0;
+		if (old.h.fc) {
+			for (i = 0; i < _PAGE_ENTRIES; i++)
+				pt->ptes[i].val = init.val | i * PAGE_SIZE;
+			/* no need to take locks as the page table is not installed yet */
+			pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+			pgste_init.pcl = uses_skeys && init.h.i;
+			dat_init_pgstes(pt, pgste_init.val);
+		} else {
+			dat_init_page_table(pt, init.val, 0);
+		}
+
+		if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) {
+			if (!pgste_init.pcl)
+				return 0;
+			for (i = 0; i < _PAGE_ENTRIES; i++) {
+				union pgste pgste = pt->pgstes[i];
+
+				pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste);
+				pgste_set_unlock(pt->ptes + i, pgste);
+			}
+			return 0;
+		}
+		old = READ_ONCE(*pmdp);
+	}
+
+	dat_free_pt(pt);
+	return 0;
+}
+
+/*
+ * dat_split_crste - Split a crste into smaller crstes
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: 0 in case of success, -ENOMEM if running out of memory.
+ */
+static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep,
+			   gfn_t gfn, union asce asce, bool uses_skeys)
+{
+	struct crst_table *table;
+	union crste old, new, init;
+	int i;
+
+	old = READ_ONCE(*crstep);
+	if (is_pmd(old))
+		return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys);
+
+	BUG_ON(!mc);
+
+	/* Already split, nothing to do */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	table = dat_alloc_crst_noinit(mc);
+	if (!table)
+		return -ENOMEM;
+
+	new.val = virt_to_phys(table);
+	new.h.tt = old.h.tt;
+	new.h.fc0.tl = _REGION_ENTRY_LENGTH;
+
+	while (old.h.i || old.h.fc) {
+		init = old;
+		init.h.tt--;
+		if (old.h.fc) {
+			for (i = 0; i < _CRST_ENTRIES; i++)
+				table->crstes[i].val = init.val | i * HPAGE_SIZE;
+		} else {
+			crst_table_init((void *)table, init.val);
+		}
+		if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
+			return 0;
+		old = READ_ONCE(*crstep);
+	}
+
+	dat_free_crst(table);
+	return 0;
+}
+
+/**
+ * dat_entry_walk() - walk the gmap page tables
+ * @mc: cache to use to allocate dat tables, if needed; can be NULL if neither
+ *      @DAT_WALK_SPLIT nor @DAT_WALK_ALLOC is specified.
+ * @gfn: the guest frame number
+ * @asce: the ASCE of the address space
+ * @flags: flags from the DAT_WALK_* macros
+ * @walk_level: level to walk to, from the TABLE_TYPE_* macros
+ * @last: will be filled with the last visited non-pte DAT entry
+ * @ptepp: will be filled with the last visited pte entry, if any, otherwise with NULL
+ *
+ * Returns, via @last and @ptepp, the table entry for the given guest address and @walk_level.
+ *
+ * The @flags have the following meanings:
+ * * @DAT_WALK_IGN_HOLES: consider holes as normal table entries
+ * * @DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
+ * * @DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
+ * * @DAT_WALK_LEAF: return successfully whenever a large page is encountered
+ * * @DAT_WALK_ANY: return successfully even if the requested level could not be reached
+ * * @DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to
+ *   continue walking to ptes with only DAT_WALK_ANY
+ * * @DAT_WALK_USES_SKEYS: storage keys are in use
+ *
+ * Context: called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * PGM_ADDRESSING if the requested address lies outside memory
+ * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
+ * * -EFAULT if the requested address lies inside a memory hole of a different type
+ * * -EINVAL if the given ASCE is not compatible with the requested level
+ * * -EFBIG if the requested level could not be reached because a larger frame was found
+ * * -ENOENT if the requested level could not be reached for other reasons
+ * * -ENOMEM if running out of memory while allocating or splitting a table
+ */
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+		   int walk_level, union crste **last, union pte **ptepp)
+{
+	union vaddress vaddr = { .addr = gfn_to_gpa(gfn) };
+	bool continue_anyway = flags & DAT_WALK_CONTINUE;
+	bool uses_skeys = flags & DAT_WALK_USES_SKEYS;
+	bool ign_holes = flags & DAT_WALK_IGN_HOLES;
+	bool allocate = flags & DAT_WALK_ALLOC;
+	bool split = flags & DAT_WALK_SPLIT;
+	bool leaf = flags & DAT_WALK_LEAF;
+	bool any = flags & DAT_WALK_ANY;
+	struct page_table *pgtable;
+	struct crst_table *table;
+	union crste entry;
+	int rc;
+
+	*last = NULL;
+	*ptepp = NULL;
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, gfn))
+		return PGM_ADDRESSING;
+
+	table = dereference_asce(asce);
+	if (asce.dt >= ASCE_TYPE_REGION1) {
+		*last = table->crstes + vaddr.rfx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION1)
+			return 0;
+		if (entry.pgd.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pgd);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION2) {
+		*last = table->crstes + vaddr.rsx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION2)
+			return 0;
+		if (entry.p4d.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.p4d);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION3) {
+		*last = table->crstes + vaddr.rtx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION3 &&
+		    continue_anyway && !entry.pud.h.fc && !entry.h.i) {
+			walk_level = TABLE_TYPE_PAGE_TABLE;
+			allocate = false;
+		}
+		if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc))
+			return 0;
+		if (entry.pud.h.i && !entry.pud.h.fc) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) {
+			if (!split)
+				return -EFBIG;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pud);
+	}
+
+	*last = table->crstes + vaddr.sx;
+	entry = READ_ONCE(**last);
+	if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT))
+		return -EINVAL;
+	if (crste_hole(entry) && !ign_holes)
+		return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+	if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) {
+		walk_level = TABLE_TYPE_PAGE_TABLE;
+		allocate = false;
+	}
+	if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc))
+		return 0;
+
+	if (entry.pmd.h.i && !entry.pmd.h.fc) {
+		if (!allocate)
+			return any ? 0 : -ENOENT;
+		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) {
+		if (!split)
+			return -EFBIG;
+		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	pgtable = dereference_pmd(entry.pmd);
+	*ptepp = pgtable->ptes + vaddr.px;
+	if (pte_hole(**ptepp) && !ign_holes)
+		return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT;
+	return 0;
+}
+
+static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
+{
+	unsigned int idx = gfn & (_PAGE_ENTRIES - 1);
+	long rc = 0;
+
+	for ( ; gfn < end; idx++, gfn++) {
+		if (pte_hole(READ_ONCE(table->ptes[idx]))) {
+			if (!(w->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(w->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+
+static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
+				 struct dat_walk *walk)
+{
+	unsigned long idx, cur_shift, cur_size;
+	dat_walk_op the_op;
+	union crste crste;
+	gfn_t cur, next;
+	long rc = 0;
+
+	cur_shift = 8 + table->crstes[0].h.tt * 11;
+	idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
+	cur_size = 1UL << cur_shift;
+
+	for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) {
+		next = cur + cur_size;
+		walk->last = table->crstes + idx;
+		crste = READ_ONCE(*walk->last);
+
+		if (crste_hole(crste)) {
+			if (!(walk->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(walk->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		the_op = walk->ops->crste_ops[crste.h.tt];
+		if (the_op) {
+			rc = the_op(walk->last, cur, next, walk);
+			crste = READ_ONCE(*walk->last);
+		}
+		if (rc)
+			break;
+		if (!crste.h.i && !crste.h.fc) {
+			if (!is_pmd(crste))
+				rc = dat_crste_walk_range(max(start, cur), min(end, next),
+							  _dereference_crste(crste), walk);
+			else if (walk->ops->pte_entry)
+				rc = dat_pte_walk_range(max(start, cur), min(end, next),
+							dereference_pmd(crste.pmd), walk);
+		}
+	}
+	return rc;
+}
+
+/**
+ * _dat_walk_gfn_range() - walk DAT tables
+ * @start: the first guest page frame to walk
+ * @end: the guest page frame immediately after the last one to walk
+ * @asce: the ASCE of the guest mapping
+ * @ops: the dat_walk_ops that will be used to perform the walk
+ * @flags: flags from DAT_WALK_* (currently only DAT_WALK_IGN_HOLES is supported)
+ * @priv: will be passed as-is to the callbacks
+ *
+ * Any callback returning non-zero causes the walk to stop immediately.
+ *
+ * Return: -EINVAL if @asce is invalid, -EFAULT if @start lies above the range
+ *         covered by @asce (unless DAT_WALK_IGN_HOLES is specified, in which
+ *         case 0 is returned), otherwise whatever the callbacks return.
+ */
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv)
+{
+	struct crst_table *table = dereference_asce(asce);
+	struct dat_walk walk = {
+		.ops = ops,
+		.asce = asce,
+		.priv = priv,
+		.flags = flags,
+		.start = start,
+		.end = end,
+	};
+
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, start))
+		return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT;
+
+	return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 6a336c3c6f62..5488bdc1a79b 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -45,6 +45,7 @@ enum {
 #define TABLE_TYPE_PAGE_TABLE -1
 
 enum dat_walk_flags {
+	DAT_WALK_USES_SKEYS	= 0x40,
 	DAT_WALK_CONTINUE	= 0x20,
 	DAT_WALK_IGN_HOLES	= 0x10,
 	DAT_WALK_SPLIT		= 0x08,
@@ -332,6 +333,34 @@ struct page_table {
 static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE);
 static_assert(sizeof(struct page_table) == PAGE_SIZE);
 
+struct dat_walk;
+
+typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w);
+
+struct dat_walk_ops {
+	union {
+		dat_walk_op crste_ops[4];
+		struct {
+			dat_walk_op pmd_entry;
+			dat_walk_op pud_entry;
+			dat_walk_op p4d_entry;
+			dat_walk_op pgd_entry;
+		};
+	};
+	long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w);
+};
+
+struct dat_walk {
+	const struct dat_walk_ops *ops;
+	union crste *last;
+	union pte *last_pte;
+	union asce asce;
+	gfn_t start;
+	gfn_t end;
+	int flags;
+	void *priv;
+};
+
 /**
  * _pte() - Useful constructor for union pte
  * @pfn: the pfn this pte should point to.
@@ -436,6 +465,11 @@ bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste ne
 		    union asce asce);
 void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce);
 
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv);
+
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+		   int walk_level, union crste **last, union pte **ptepp);
 void dat_free_level(struct crst_table *table, bool owns_ptes);
 
 struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
@@ -834,4 +868,9 @@ static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce a
 	dat_crstep_xchg(crstep, newcrste, gfn, asce);
 }
 
+static inline int get_level(union crste *crstep, union pte *ptep)
+{
+	return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt;
+}
+
 #endif /* __KVM_S390_DAT_H */
-- 
2.51.1
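
As a quick orientation for reviewers, here is a minimal usage sketch (not part
of the patch) of how a caller might resolve a single gfn down to its pte with
dat_entry_walk(). The wrapper name kvm_s390_walk_to_pte() and its error
handling are made up for illustration; only dat_entry_walk(), the DAT_WALK_*
flags and TABLE_TYPE_PAGE_TABLE come from the patch, and the mmu cache in @mc
is assumed to have been topped up by the caller beforehand:

/* Illustrative sketch only; kvm_s390_walk_to_pte() is a hypothetical caller. */
static int kvm_s390_walk_to_pte(struct kvm_s390_mmu_cache *mc, union asce asce,
				gfn_t gfn, union pte **ptepp)
{
	union crste *last;
	int rc;

	/* kvm->mmu_lock must be held, as documented for dat_entry_walk() */
	rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_ALLOC | DAT_WALK_SPLIT,
			    TABLE_TYPE_PAGE_TABLE, &last, ptepp);
	if (rc > 0)
		return -EFAULT;	/* positive returns are PIC/PGM numbers; handling is caller-specific */
	return rc;
}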
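
Similarly, a rough sketch of the range-walk interface, again not part of the
patch: a hypothetical helper that counts how many pages in a gfn range
currently have a valid pte. The callback and helper names are invented; the
dat_walk_ops and dat_walk structures, _dat_walk_gfn_range() and
DAT_WALK_IGN_HOLES are the ones introduced above, and pte validity is tested
through the invalid bit (h.i), as dat_split_ste() does for the ptes it builds:

/* Illustrative sketch only; the counting helper is hypothetical. */
static long count_valid_pte(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	long *count = w->priv;

	if (!pte->h.i)
		(*count)++;
	return 0;	/* returning non-zero would stop the walk immediately */
}

static const struct dat_walk_ops count_ops = {
	.pte_entry = count_valid_pte,
};

static long kvm_s390_count_mapped(union asce asce, gfn_t start, gfn_t end)
{
	long count = 0;
	long rc;

	/* with DAT_WALK_IGN_HOLES, holes are skipped instead of failing with -EFAULT */
	rc = _dat_walk_gfn_range(start, end, asce, &count_ops,
				 DAT_WALK_IGN_HOLES, &count);
	return rc < 0 ? rc : count;
}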