This paves the way for scalable PID allocation later. The 32 bit variant merely takes a spinlock for simplicity; the 64 bit variant uses a scalable scheme. Signed-off-by: Mateusz Guzik --- This patch assumes the rb -> rhashtable conversion has landed. I booted the 32 bit code on the 64 bit kernel; I take it it's fine. I'm slightly worried about error handling. It seems pid->pidfs_hash.next = NULL is supposed to sort it out. Given that an ino of 0 is not legal, I think it should instead be used as a sentinel value indicating whether the pid is present in the table. So something like: alloc_pid: pid->ino = 0; .... then: void pidfs_remove_pid(struct pid *pid) { if (unlikely(!pid->ino)) return; rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, pidfs_ino_ht_params); } fs/pidfs.c | 107 +++++++++++++++++++++++++++++++++++---------------- kernel/pid.c | 3 +- 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 3da5e8e0a76b..46b46a484d45 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -65,7 +65,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = { .automatic_shrinking = true, }; +/* + * inode number handling + * + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bit. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 2 + * so inode numbering starts at 2 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. 
+ * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. + */ + #if BITS_PER_LONG == 32 + +DEFINE_SPINLOCK(pidfs_ino_lock); +static u64 pidfs_ino_nr = 2; + static inline unsigned long pidfs_ino(u64 ino) { return lower_32_bits(ino); @@ -77,6 +109,18 @@ static inline u32 pidfs_gen(u64 ino) return upper_32_bits(ino); } +static inline u64 pidfs_alloc_ino(void) +{ + u64 ino; + + spin_lock(&pidfs_ino_lock); + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr += 2; + ino = pidfs_ino_nr++; + spin_unlock(&pidfs_ino_lock); + return ino; +} + #else /* On 64 bit simply return ino. */ @@ -90,53 +134,48 @@ static inline u32 pidfs_gen(u64 ino) { return 0; } -#endif /* - * Allocate inode number and initialize pidfs fields. - * Called with pidmap_lock held. + * A patched up copy of get_next_ino(). Uses 64 bit, does not do overflow checks + * and guarantees ino of at least 2. */ -void pidfs_prepare_pid(struct pid *pid) +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(u64, pidfs_last_ino); + +static u64 pidfs_alloc_ino(void) { - static u64 pidfs_ino_nr = 2; + u64 *p = &get_cpu_var(pidfs_last_ino); + u64 res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic64_t pidfs_shared_last_ino = ATOMIC_INIT(2); + u64 next = atomic64_add_return(LAST_INO_BATCH, &pidfs_shared_last_ino); + res = next - LAST_INO_BATCH; + } +#endif - /* - * On 64 bit nothing special happens. The 64bit number assigned - * to struct pid is the inode number. - * - * On 32 bit the 64 bit number assigned to struct pid is split - * into two 32 bit numbers. 
The lower 32 bits are used as the - * inode number and the upper 32 bits are used as the inode - * generation number. - * - * On 32 bit pidfs_ino() will return the lower 32 bit. When - * pidfs_ino() returns zero a wrap around happened. When a - * wraparound happens the 64 bit number will be incremented by 2 - * so inode numbering starts at 2 again. - * - * On 64 bit comparing two pidfds is as simple as comparing - * inode numbers. - * - * When a wraparound happens on 32 bit multiple pidfds with the - * same inode number are likely to exist (This isn't a problem - * since before pidfs pidfds used the anonymous inode meaning - * all pidfds had the same inode number.). Userspace can - * reconstruct the 64 bit identifier by retrieving both the - * inode number and the inode generation number to compare or - * use file handles. - */ - if (pidfs_ino(pidfs_ino_nr) == 0) - pidfs_ino_nr += 2; + res++; + *p = res; + put_cpu_var(pidfs_last_ino); + return res; +} + +#endif - pid->ino = pidfs_ino_nr; +/* + * Initialize pidfs fields. + */ +void pidfs_prepare_pid(struct pid *pid) +{ pid->pidfs_hash.next = NULL; pid->stashed = NULL; pid->attr = NULL; - pidfs_ino_nr++; } int pidfs_add_pid(struct pid *pid) { + pid->ino = pidfs_alloc_ino(); return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, pidfs_ino_ht_params); } diff --git a/kernel/pid.c b/kernel/pid.c index 06356e40ac00..72c9372b84b8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); + pidfs_prepare_pid(pid); /* * 2. 
perm check checkpoint_restore_ns_capable() @@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retval = -ENOMEM; if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) goto out_free; - pidfs_prepare_pid(pid); - for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); -- 2.48.1