Add page table management functions to be used for KVM guest (gmap) page
tables: functions to walk to specific table entries, and to perform actions
on a range of entries.

Signed-off-by: Claudio Imbrenda
---
 arch/s390/kvm/dat.c | 351 ++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kvm/dat.h |  38 +++++
 2 files changed, 389 insertions(+)

diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index f26e3579bd77..fe93e1c07158 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -209,3 +209,354 @@ union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, g
 	WRITE_ONCE(*ptep, new);
 	return pgste;
 }
+
+/*
+ * dat_split_pmd() is assumed to be called with the mmap_lock held in read
+ * or write mode.
+ */
+static int dat_split_pmd(union pmd *pmdp, gfn_t gfn, union asce asce)
+{
+	struct page_table *pt;
+	union pmd new, old;
+	union pte init;
+	int i;
+
+	old = READ_ONCE(*pmdp);
+
+	/* Already split, nothing to do */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	pt = dat_alloc_pt_noinit();
+	if (!pt)
+		return -ENOMEM;
+	new.val = virt_to_phys(pt);
+
+	while (old.h.i || old.h.fc) {
+		init.val = pmd_origin_large(old);
+		init.h.p = old.h.p;
+		init.h.i = old.h.i;
+		init.s.d = old.s.fc1.d;
+		init.s.w = old.s.fc1.w;
+		init.s.y = old.s.fc1.y;
+		init.s.sd = old.s.fc1.sd;
+		init.s.pr = old.s.fc1.pr;
+		if (old.h.fc) {
+			for (i = 0; i < _PAGE_ENTRIES; i++)
+				pt->ptes[i].val = init.val | i * PAGE_SIZE;
+			/* no need to take locks as the page table is not installed yet */
+			dat_init_pgstes(pt, old.s.fc1.prefix_notif ? PGSTE_IN_BIT : 0);
+		} else {
+			dat_init_page_table(pt, init.val, 0);
+		}
+
+		if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce))
+			return 0;
+		old = READ_ONCE(*pmdp);
+	}
+
+	dat_free_pt(pt);
+	return 0;
+}
+
+static int dat_split_crste(union crste *crstep, gfn_t gfn, union asce asce)
+{
+	struct crst_table *table;
+	union crste old, new, init;
+	int i;
+
+	old = READ_ONCE(*crstep);
+	if (is_pmd(old))
+		return dat_split_pmd(&crstep->pmd, gfn, asce);
+
+	/* Already split, nothing to do */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	table = dat_alloc_crst_noinit();
+	if (!table)
+		return -ENOMEM;
+
+	new.val = virt_to_phys(table);
+	new.h.tt = old.h.tt;
+	new.h.fc0.tl = _REGION_ENTRY_LENGTH;
+
+	while (old.h.i || old.h.fc) {
+		init = old;
+		init.h.tt--;
+		if (old.h.fc) {
+			for (i = 0; i < _CRST_ENTRIES; i++)
+				table->crstes[i].val = init.val | i * HPAGE_SIZE;
+		} else {
+			crst_table_init((void *)table, init.val);
+		}
+		if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
+			return 0;
+		old = READ_ONCE(*crstep);
+	}
+
+	dat_free_crst(table);
+	return 0;
+}
+
+/**
+ * dat_entry_walk() - walk the gmap page tables
+ * @gfn: the guest frame number
+ * @asce: the ASCE of the address space
+ * @flags: flags from the DAT_WALK_* macros
+ * @walk_level: level to walk to, from the LEVEL_* macros
+ * @last: will be filled with the last visited non-pte DAT entry
+ * @ptepp: will be filled with the last visited pte entry, if any, otherwise with NULL
+ *
+ * Walks to the table entry for the given guest address and @walk_level; the
+ * entry is returned via @last (and @ptepp for pte entries).
+ *
+ * The @flags have the following meanings:
+ * * @DAT_WALK_IGN_HOLES: consider holes as normal table entries
+ * * @DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
+ * * @DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
+ * * @DAT_WALK_LEAF: return successfully whenever a large page is encountered
+ * * @DAT_WALK_ANY: return successfully even if the requested level could not be reached
+ * * @DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and
+ *			then try to continue walking to the ptes with only
+ *			DAT_WALK_ANY
+ *
+ * Context: called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * 0 in case of success
+ * * PGM_ADDRESSING if the requested address lies outside memory
+ * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
+ * * -EFAULT if the requested address lies inside a memory hole of a different type
+ * * -EINVAL if the given ASCE is not compatible with the requested level
+ * * -EFBIG if the requested level could not be reached because a larger frame was found
+ * * -ENOENT if the requested level could not be reached for other reasons
+ * * -ENOMEM if running out of memory while allocating or splitting a table
+ */
+int dat_entry_walk(gfn_t gfn, union asce asce, int flags, int walk_level,
+		   union crste **last, union pte **ptepp)
+{
+	bool continue_anyway = flags & DAT_WALK_CONTINUE;
+	bool ign_holes = flags & DAT_WALK_IGN_HOLES;
+	bool allocate = flags & DAT_WALK_ALLOC;
+	bool split = flags & DAT_WALK_SPLIT;
+	bool leaf = flags & DAT_WALK_LEAF;
+	bool any = flags & DAT_WALK_ANY;
+	struct page_table *pgtable;
+	struct crst_table *table;
+	union crste entry;
+	int rc;
+
+	*last = NULL;
+	*ptepp = NULL;
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, gfn))
+		return PGM_ADDRESSING;
+
+	table = dereference_asce(asce);
+	if (asce.dt >= ASCE_TYPE_REGION1) {
+		*last = table->crstes + pgd_index(gfn_to_gpa(gfn));
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(unlikely(entry.h.tt != LEVEL_PGD)))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == LEVEL_PGD)
+			return 0;
+		if (entry.pgd.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(*last, gfn, asce);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pgd);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION2) {
+		*last = table->crstes + p4d_index(gfn_to_gpa(gfn));
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(unlikely(entry.h.tt != LEVEL_P4D)))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == LEVEL_P4D)
+			return 0;
+		if (entry.p4d.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(*last, gfn, asce);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.p4d);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION3) {
+		*last = table->crstes + pud_index(gfn_to_gpa(gfn));
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(unlikely(entry.h.tt != LEVEL_PUD)))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == LEVEL_PUD && continue_anyway && !entry.pud.h.fc && !entry.h.i) {
+			walk_level = LEVEL_PTE;
+			allocate = false;
+		}
+		if (walk_level == LEVEL_PUD || ((leaf || any) && entry.pud.h.fc))
+			return 0;
+		if (entry.pud.h.i && !entry.pud.h.fc) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(*last, gfn, asce);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		if (walk_level <= LEVEL_PMD && entry.pud.h.fc) {
+			if (!split)
+				return -EFBIG;
+			rc = dat_split_crste(*last, gfn, asce);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pud);
+	}
+
+	*last = table->crstes + pmd_index(gfn_to_gpa(gfn));
+	entry = READ_ONCE(**last);
+	if (WARN_ON_ONCE(unlikely(entry.h.tt != LEVEL_PMD)))
+		return -EINVAL;
+	if (crste_hole(entry) && !ign_holes)
+		return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+	if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) {
+		walk_level = LEVEL_PTE;
+		allocate = false;
+	}
+	if (walk_level == LEVEL_PMD || ((leaf || any) && entry.pmd.h.fc))
+		return 0;
+
+	if (entry.pmd.h.i && !entry.pmd.h.fc) {
+		if (!allocate)
+			return any ? 0 : -ENOENT;
+		rc = dat_split_crste(*last, gfn, asce);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	if (walk_level <= LEVEL_PTE && entry.pmd.h.fc) {
+		if (!split)
+			return -EFBIG;
+		rc = dat_split_crste(*last, gfn, asce);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	pgtable = dereference_pmd(entry.pmd);
+	*ptepp = pgtable->ptes + pte_index(gfn_to_gpa(gfn));
+	if (pte_hole(**ptepp) && !ign_holes)
+		return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT;
+	return 0;
+}
+
+static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
+{
+	unsigned int idx = gfn & (_PAGE_ENTRIES - 1);
+	long rc = 0;
+
+	for ( ; gfn < end; idx++, gfn++) {
+		if (pte_hole(READ_ONCE(table->ptes[idx]))) {
+			if (!(w->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(w->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+
+static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
+				 struct dat_walk *walk)
+{
+	unsigned long idx, cur_shift, cur_size;
+	dat_walk_op the_op;
+	union crste crste;
+	gfn_t cur, next;
+	long rc = 0;
+
+	cur_shift = 8 + table->crstes[0].h.tt * 11;
+	idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
+	cur_size = 1UL << cur_shift;
+
+	for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) {
+		next = cur + cur_size;
+		walk->last = table->crstes + idx;
+		crste = READ_ONCE(*walk->last);
+
+		if (crste_hole(crste)) {
+			if (!(walk->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(walk->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		the_op = walk->ops->crste_ops[crste.h.tt];
+		if (the_op) {
+			rc = the_op(walk->last, cur, next, walk);
+			crste = READ_ONCE(*walk->last);
+		}
+		if (rc)
+			break;
+		if (!crste.h.i && !crste.h.fc) {
+			if (!is_pmd(crste))
+				rc = dat_crste_walk_range(max(start, cur), min(end, next),
+							  _dereference_crste(crste), walk);
+			else if (walk->ops->pte_entry)
+				rc = dat_pte_walk_range(max(start, cur), min(end, next),
+							dereference_pmd(crste.pmd), walk);
+		}
+	}
+	return rc;
+}
+
+/**
+ * _dat_walk_gfn_range() - walk DAT tables
+ * @start: the first guest page frame to walk
+ * @end: the guest page frame immediately after the last one to walk
+ * @asce: the ASCE of the guest mapping
+ * @ops: the dat_walk_ops that will be used to perform the walk
+ * @flags: flags from DAT_WALK_* (currently only DAT_WALK_IGN_HOLES and
+ *	   DAT_WALK_ANY are honoured)
+ * @priv: will be passed as-is to the callbacks
+ *
+ * Any callback returning non-zero causes the walk to stop immediately.
+ *
+ * Return: -EINVAL in case of error, -EFAULT if @start lies outside the given
+ *	   @asce (unless the DAT_WALK_IGN_HOLES flag is specified, in which
+ *	   case 0 is returned), otherwise whatever the callbacks return.
+ */
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv)
+{
+	struct crst_table *table = dereference_asce(asce);
+	struct dat_walk walk = {
+		.ops = ops,
+		.asce = asce,
+		.priv = priv,
+		.flags = flags,
+		.start = start,
+		.end = end,
+	};
+
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, start))
+		return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT;
+
+	return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 9e23f6cdbf73..de4bd2298945 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -346,6 +346,34 @@ struct page_table {
 static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE);
 static_assert(sizeof(struct page_table) == PAGE_SIZE);
 
+struct dat_walk;
+
+typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w);
+
+struct dat_walk_ops {
+	union {
+		dat_walk_op crste_ops[4];
+		struct {
+			dat_walk_op pmd_entry;
+			dat_walk_op pud_entry;
+			dat_walk_op p4d_entry;
+			dat_walk_op pgd_entry;
+		};
+	};
+	long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w);
+};
+
+struct dat_walk {
+	const struct dat_walk_ops *ops;
+	union crste *last;
+	union pte *last_pte;
+	union asce asce;
+	gfn_t start;
+	gfn_t end;
+	int flags;
+	void *priv;
+};
+
 static inline union pte _pte(kvm_pfn_t pfn, bool w, bool d, bool s)
 {
 	union pte res = { .val = PFN_PHYS(pfn) };
@@ -391,6 +419,11 @@ bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste ne
 			    union asce asce);
 void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce);
 
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv);
+
+int dat_entry_walk(gfn_t gfn, union asce asce, int flags, int walk_level,
+		   union crste **last, union pte **ptepp);
 void dat_free_level(struct crst_table *table, bool owns_ptes);
 struct page_table *dat_alloc_pt(unsigned long pte_bits, unsigned long pgste_bits);
 struct crst_table *dat_alloc_crst(unsigned long init);
@@ -734,4 +767,9 @@ static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce a
 	dat_crstep_xchg(crstep, newcrste, gfn, asce);
 }
 
+static inline int get_level(union crste *crstep, union pte *ptep)
+{
+	return ptep ? LEVEL_PTE : crstep->h.tt;
+}
+
 #endif /* __KVM_S390_DAT_H */
-- 
2.51.0
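
As an illustration for reviewers, here is a minimal sketch of how the two new
entry points might be driven together, using only the interfaces added by this
patch. The callback and helper names (count_pte, count_ops, count_mapped_pages,
get_pte_for_gfn) are hypothetical, and the use of pte h.i as the invalid bit is
an assumption drawn from dat_split_pmd() above:

/*
 * Hypothetical usage sketch (not part of the patch): count the valid
 * ptes in a range of guest frames with _dat_walk_gfn_range(), and walk
 * to a single pte with dat_entry_walk().
 */
static long count_pte(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	long *count = w->priv;

	/* assumption: h.i is the pte invalid bit, as used in dat_split_pmd() */
	if (!pte->h.i)
		(*count)++;
	return 0;	/* returning non-zero would stop the walk */
}

static const struct dat_walk_ops count_ops = {
	.pte_entry = count_pte,
};

/* Count mapped pages in [start, end); holes are skipped, not faulted on. */
static long count_mapped_pages(union asce asce, gfn_t start, gfn_t end)
{
	long count = 0;
	long rc;

	rc = _dat_walk_gfn_range(start, end, asce, &count_ops,
				 DAT_WALK_IGN_HOLES, &count);
	return rc < 0 ? rc : count;
}

/* Walk to the pte of @gfn, allocating tables and splitting large pages. */
static int get_pte_for_gfn(union asce asce, gfn_t gfn, union pte **ptepp)
{
	union crste *last;

	return dat_entry_walk(gfn, asce, DAT_WALK_ALLOC | DAT_WALK_SPLIT,
			      LEVEL_PTE, &last, ptepp);
}

Note that, per dat_pte_walk_range(), DAT_WALK_IGN_HOLES alone makes the walker
skip holes rather than invoke the callback on them; adding DAT_WALK_ANY would
pass hole entries through to the callback as well.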