From 6062a8dc0517bce23e3c2f7d2fea5e22411269a3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Apr 2013 19:15:44 -0700 Subject: [PATCH] ipc,sem: fine grained locking for semtimedop Introduce finer grained locking for semtimedop, to handle the common case of a program wanting to manipulate one semaphore from an array with multiple semaphores. If the call is a semop manipulating just one semaphore in an array with multiple semaphores, only take the lock for that semaphore itself. If the call needs to manipulate multiple semaphores, or another caller is in a transaction that manipulates multiple semaphores, the sem_array lock is taken, as well as all the locks for the individual semaphores. On a 24 CPU system, performance numbers with the semop-multi test with N threads and N semaphores, look like this: vanilla Davidlohr's Davidlohr's + Davidlohr's + threads patches rwlock patches v3 patches 10 610652 726325 1783589 2142206 20 341570 365699 1520453 1977878 30 288102 307037 1498167 2037995 40 290714 305955 1612665 2256484 50 288620 312890 1733453 2650292 60 289987 306043 1649360 2388008 70 291298 306347 1723167 2717486 80 290948 305662 1729545 2763582 90 290996 306680 1736021 2757524 100 292243 306700 1773700 3059159 [davidlohr.bueso@hp.com: do not call sem_lock when bogus sma] [davidlohr.bueso@hp.com: make refcounter atomic] Signed-off-by: Rik van Riel Suggested-by: Linus Torvalds Acked-by: Davidlohr Bueso Cc: Chegu Vinod Cc: Jason Low Reviewed-by: Michel Lespinasse Cc: Peter Hurley Cc: Stanislav Kinsbursky Tested-by: Emmanuel Benisty Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 7 +- ipc/sem.c | 271 +++++++++++++++++++++++++++++++++-------------------- ipc/util.c | 48 +++++----- ipc/util.h | 2 +- 4 files changed, 203 insertions(+), 125 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index a80aaf463d9c..09a1f41e6595 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -687,7 +687,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock_free; } ss_add(msq, &s); - ipc_rcu_getref(msq); + + if (!ipc_rcu_getref(msq)) { + err = -EIDRM; + goto out_unlock_free; + } + msg_unlock(msq); schedule(); diff --git a/ipc/sem.c b/ipc/sem.c index f68b61749a85..e78ee3186d1f 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -94,6 +94,7 @@ struct sem { int semval; /* current value */ int sempid; /* pid of last operation */ + spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head sem_pending; /* pending single-sop operations */ }; @@ -137,7 +138,6 @@ struct sem_undo_list { #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) -#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) #define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid) static int newary(struct ipc_namespace *, struct ipc_params *); @@ -189,11 +189,90 @@ void __init sem_init (void) IPC_SEM_IDS, sysvipc_sem_proc_show); } +/* + * If the request contains only one semaphore operation, and there are + * no complex transactions pending, lock only the semaphore involved. + * Otherwise, lock the entire semaphore array, since we either have + * multiple semaphores in our own semops, or we need to look at + * semaphores from other pending complex operations. + * + * Carefully guard against sma->complex_count changing between zero + * and non-zero while we are spinning for the lock. The value of + * sma->complex_count cannot change while we are holding the lock, + * so sem_unlock should be fine. + * + * The global lock path checks that all the local locks have been released, + * checking each local lock once. This means that the local lock paths + * cannot start their critical sections while the global lock is held. + */ +static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, + int nsops) +{ + int locknum; + again: + if (nsops == 1 && !sma->complex_count) { + struct sem *sem = sma->sem_base + sops->sem_num; + + /* Lock just the semaphore we are interested in. */ + spin_lock(&sem->lock); + + /* + * If sma->complex_count was set while we were spinning, + * we may need to look at things we did not lock here. + */ + if (unlikely(sma->complex_count)) { + spin_unlock(&sem->lock); + goto lock_array; + } + + /* + * Another process is holding the global lock on the + * sem_array; we cannot enter our critical section, + * but have to wait for the global lock to be released. + */ + if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { + spin_unlock(&sem->lock); + spin_unlock_wait(&sma->sem_perm.lock); + goto again; + } + + locknum = sops->sem_num; + } else { + int i; + /* + * Lock the semaphore array, and wait for all of the + * individual semaphore locks to go away. The code + * above ensures no new single-lock holders will enter + * their critical section while the array lock is held. + */ + lock_array: + spin_lock(&sma->sem_perm.lock); + for (i = 0; i < sma->sem_nsems; i++) { + struct sem *sem = sma->sem_base + i; + spin_unlock_wait(&sem->lock); + } + locknum = -1; + } + return locknum; +} + +static inline void sem_unlock(struct sem_array *sma, int locknum) +{ + if (locknum == -1) { + spin_unlock(&sma->sem_perm.lock); + } else { + struct sem *sem = sma->sem_base + locknum; + spin_unlock(&sem->lock); + } + rcu_read_unlock(); +} + /* * sem_lock_(check_) routines are called in the paths where the rw_mutex * is not held. */ -static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id) +static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, + int id, struct sembuf *sops, int nsops, int *locknum) { struct kern_ipc_perm *ipcp; struct sem_array *sma; @@ -205,7 +284,8 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id goto err; } - spin_lock(&ipcp->lock); + sma = container_of(ipcp, struct sem_array, sem_perm); + *locknum = sem_lock(sma, sops, nsops); /* ipc_rmid() may have already freed the ID while sem_lock * was spinning: verify that the structure is still valid @@ -213,7 +293,7 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, int id if (!ipcp->deleted) return container_of(ipcp, struct sem_array, sem_perm); - spin_unlock(&ipcp->lock); + sem_unlock(sma, *locknum); sma = ERR_PTR(-EINVAL); err: rcu_read_unlock(); @@ -230,17 +310,6 @@ static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int return container_of(ipcp, struct sem_array, sem_perm); } -static inline struct sem_array *sem_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&sem_ids(ns), id); - - if (IS_ERR(ipcp)) - return ERR_CAST(ipcp); - - return container_of(ipcp, struct sem_array, sem_perm); -} - static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns, int id) { @@ -254,21 +323,21 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { - ipc_lock_by_ptr(&sma->sem_perm); + rcu_read_lock(); + sem_lock(sma, NULL, -1); ipc_rcu_putref(sma); } static inline void sem_getref_and_unlock(struct sem_array *sma) { - ipc_rcu_getref(sma); - ipc_unlock(&(sma)->sem_perm); + WARN_ON_ONCE(!ipc_rcu_getref(sma)); + sem_unlock(sma, -1); } static inline void sem_putref(struct sem_array *sma) { - ipc_lock_by_ptr(&sma->sem_perm); - ipc_rcu_putref(sma); - ipc_unlock(&(sma)->sem_perm); + sem_lock_and_putref(sma); + sem_unlock(sma, -1); } /* @@ -276,9 +345,9 @@ static inline void sem_putref(struct sem_array *sma) */ static inline void sem_getref(struct sem_array *sma) { - spin_lock(&(sma)->sem_perm.lock); - ipc_rcu_getref(sma); - ipc_unlock(&(sma)->sem_perm); + sem_lock(sma, NULL, -1); + WARN_ON_ONCE(!ipc_rcu_getref(sma)); + sem_unlock(sma, -1); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -371,15 +440,17 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_base = (struct sem *) &sma[1]; - for (i = 0; i < nsems; i++) + for (i = 0; i < nsems; i++) { INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); + spin_lock_init(&sma->sem_base[i].lock); + } sma->complex_count = 0; INIT_LIST_HEAD(&sma->sem_pending); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); - sem_unlock(sma); + sem_unlock(sma, -1); return sma->sem_perm.id; } @@ -818,7 +889,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) /* Remove the semaphore set from the IDR */ sem_rmid(ns, sma); - sem_unlock(sma); + sem_unlock(sma, -1); wake_up_sem_queue_do(&tasks); ns->used_sems -= sma->sem_nsems; @@ -947,7 +1018,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, struct sem_array *sma; struct sem* curr; int err; - int nsems; struct list_head tasks; int val; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) @@ -958,31 +1028,39 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, val = arg; #endif - sma = sem_lock_check(ns, semid); - if (IS_ERR(sma)) - return PTR_ERR(sma); + if (val > SEMVMX || val < 0) + return -ERANGE; INIT_LIST_HEAD(&tasks); - nsems = sma->sem_nsems; - err = -EACCES; - if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) - goto out_unlock; + rcu_read_lock(); + sma = sem_obtain_object_check(ns, semid); + if (IS_ERR(sma)) { + rcu_read_unlock(); + return PTR_ERR(sma); + } + + if (semnum < 0 || semnum >= sma->sem_nsems) { + rcu_read_unlock(); + return -EINVAL; + } + + + if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) { + rcu_read_unlock(); + return -EACCES; + } err = security_sem_semctl(sma, SETVAL); - if (err) - goto out_unlock; + if (err) { + rcu_read_unlock(); + return -EACCES; + } - err = -EINVAL; - if(semnum < 0 || semnum >= nsems) - goto out_unlock; + sem_lock(sma, NULL, -1); curr = &sma->sem_base[semnum]; - err = -ERANGE; - if (val > SEMVMX || val < 0) - goto out_unlock; - assert_spin_locked(&sma->sem_perm.lock); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; @@ -992,11 +1070,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &tasks); - err = 0; -out_unlock: - sem_unlock(sma); + sem_unlock(sma, -1); wake_up_sem_queue_do(&tasks); - return err; + return 0; } static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, @@ -1051,16 +1127,16 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma); + sem_unlock(sma, -1); err = -EIDRM; goto out_free; } - } + } else + sem_lock(sma, NULL, -1); - spin_lock(&sma->sem_perm.lock); for (i = 0; i < sma->sem_nsems; i++) sem_io[i] = sma->sem_base[i].semval; - sem_unlock(sma); + sem_unlock(sma, -1); err = 0; if(copy_to_user(array, sem_io, nsems*sizeof(ushort))) err = -EFAULT; @@ -1071,7 +1147,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; struct sem_undo *un; - ipc_rcu_getref(sma); + if (!ipc_rcu_getref(sma)) { + rcu_read_unlock(); + return -EIDRM; + } rcu_read_unlock(); if(nsems > SEMMSL_FAST) { @@ -1097,7 +1176,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma); + sem_unlock(sma, -1); err = -EIDRM; goto out_free; } @@ -1124,7 +1203,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_wakeup; } - spin_lock(&sma->sem_perm.lock); + sem_lock(sma, NULL, -1); curr = &sma->sem_base[semnum]; switch (cmd) { @@ -1143,7 +1222,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } out_unlock: - sem_unlock(sma); + sem_unlock(sma, -1); out_wakeup: wake_up_sem_queue_do(&tasks); out_free: @@ -1211,11 +1290,11 @@ static int semctl_down(struct ipc_namespace *ns, int semid, switch(cmd){ case IPC_RMID: - ipc_lock_object(&sma->sem_perm); + sem_lock(sma, NULL, -1); freeary(ns, ipcp); goto out_up; case IPC_SET: - ipc_lock_object(&sma->sem_perm); + sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64.sem_perm, ipcp); if (err) goto out_unlock; @@ -1228,7 +1307,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, } out_unlock: - sem_unlock(sma); + sem_unlock(sma, -1); out_up: up_write(&sem_ids(ns).rw_mutex); return err; @@ -1340,8 +1419,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) struct sem_array *sma; struct sem_undo_list *ulp; struct sem_undo *un, *new; - int nsems; - int error; + int nsems, error; error = get_undo_list(&ulp); if (error) @@ -1363,7 +1441,11 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) } nsems = sma->sem_nsems; - ipc_rcu_getref(sma); + if (!ipc_rcu_getref(sma)) { + rcu_read_unlock(); + un = ERR_PTR(-EIDRM); + goto out; + } rcu_read_unlock(); /* step 2: allocate new undo structure */ @@ -1376,7 +1458,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 3: Acquire the lock on semaphore array */ sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma); + sem_unlock(sma, -1); kfree(new); un = ERR_PTR(-EIDRM); goto out; @@ -1404,7 +1486,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) success: spin_unlock(&ulp->lock); rcu_read_lock(); - sem_unlock(sma); + sem_unlock(sma, -1); out: return un; } @@ -1444,7 +1526,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, struct sembuf fast_sops[SEMOPM_FAST]; struct sembuf* sops = fast_sops, *sop; struct sem_undo *un; - int undos = 0, alter = 0, max; + int undos = 0, alter = 0, max, locknum; struct sem_queue queue; unsigned long jiffies_left = 0; struct ipc_namespace *ns; @@ -1488,22 +1570,23 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, alter = 1; } + INIT_LIST_HEAD(&tasks); + if (undos) { + /* On success, find_alloc_undo takes the rcu_read_lock */ un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out_free; } - } else + } else { un = NULL; + rcu_read_lock(); + } - INIT_LIST_HEAD(&tasks); - - rcu_read_lock(); sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { - if (un) - rcu_read_unlock(); + rcu_read_unlock(); error = PTR_ERR(sma); goto out_free; } @@ -1534,23 +1617,9 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * "un" itself is guaranteed by rcu. */ error = -EIDRM; - ipc_lock_object(&sma->sem_perm); - if (un) { - if (un->semid == -1) { - rcu_read_unlock(); - goto out_unlock_free; - } else { - /* - * rcu lock can be released, "un" cannot disappear: - * - sem_lock is acquired, thus IPC_RMID is - * impossible. - * - exit_sem is impossible, it always operates on - * current (or a dead task). - */ - - rcu_read_unlock(); - } - } + locknum = sem_lock(sma, sops, nsops); + if (un && un->semid == -1) + goto out_unlock_free; error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); if (error <= 0) { @@ -1591,7 +1660,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, sleep_again: current->state = TASK_INTERRUPTIBLE; - sem_unlock(sma); + sem_unlock(sma, locknum); if (timeout) jiffies_left = schedule_timeout(jiffies_left); @@ -1613,7 +1682,7 @@ sleep_again: goto out_free; } - sma = sem_obtain_lock(ns, semid); + sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); /* * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. @@ -1652,7 +1721,7 @@ sleep_again: unlink_queue(sma, &queue); out_unlock_free: - sem_unlock(sma); + sem_unlock(sma, locknum); out_wakeup: wake_up_sem_queue_do(&tasks); out_free: @@ -1716,8 +1785,7 @@ void exit_sem(struct task_struct *tsk) struct sem_array *sma; struct sem_undo *un; struct list_head tasks; - int semid; - int i; + int semid, i; rcu_read_lock(); un = list_entry_rcu(ulp->list_proc.next, @@ -1726,23 +1794,26 @@ void exit_sem(struct task_struct *tsk) semid = -1; else semid = un->semid; - rcu_read_unlock(); - if (semid == -1) + if (semid == -1) { + rcu_read_unlock(); break; + } - sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid); - + sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid); /* exit_sem raced with IPC_RMID, nothing to do */ - if (IS_ERR(sma)) + if (IS_ERR(sma)) { + rcu_read_unlock(); continue; + } + sem_lock(sma, NULL, -1); un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created * exactly the same semid. Nothing to do. */ - sem_unlock(sma); + sem_unlock(sma, -1); continue; } @@ -1782,7 +1853,7 @@ void exit_sem(struct task_struct *tsk) /* maybe some queued-up processes were waiting for this */ INIT_LIST_HEAD(&tasks); do_smart_update(sma, NULL, 0, 1, &tasks); - sem_unlock(sma); + sem_unlock(sma, -1); wake_up_sem_queue_do(&tasks); kfree_rcu(un, rcu); diff --git a/ipc/util.c b/ipc/util.c index 3df0af3158a5..579201e4bc01 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -439,9 +439,9 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) * NULL is returned if the allocation fails */ -void* ipc_alloc(int size) +void *ipc_alloc(int size) { - void* out; + void *out; if(size > PAGE_SIZE) out = vmalloc(size); else @@ -478,7 +478,7 @@ void ipc_free(void* ptr, int size) */ struct ipc_rcu_hdr { - int refcount; + atomic_t refcount; int is_vmalloc; void *data[0]; }; @@ -516,39 +516,41 @@ static inline int rcu_use_vmalloc(int size) * @size: size desired * * Allocate memory for the rcu header structure + the object. - * Returns the pointer to the object. - * NULL is returned if the allocation fails. + * Returns the pointer to the object or NULL upon failure. */ - -void* ipc_rcu_alloc(int size) +void *ipc_rcu_alloc(int size) { - void* out; - /* + void *out; + + /* * We prepend the allocation with the rcu struct, and - * workqueue if necessary (for vmalloc). + * workqueue if necessary (for vmalloc). */ if (rcu_use_vmalloc(size)) { out = vmalloc(HDRLEN_VMALLOC + size); - if (out) { - out += HDRLEN_VMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; - container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; - } + if (!out) + goto done; + + out += HDRLEN_VMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; } else { out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); - if (out) { - out += HDRLEN_KMALLOC; - container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; - container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; - } + if (!out) + goto done; + + out += HDRLEN_KMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; } + /* set reference counter no matter what kind of allocation was done */ + atomic_set(&container_of(out, struct ipc_rcu_hdr, data)->refcount, 1); +done: return out; } -void ipc_rcu_getref(void *ptr) +int ipc_rcu_getref(void *ptr) { - container_of(ptr, struct ipc_rcu_hdr, data)->refcount++; + return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu_hdr, data)->refcount); } static void ipc_do_vfree(struct work_struct *work) @@ -578,7 +580,7 @@ static void ipc_schedule_free(struct rcu_head *head) void ipc_rcu_putref(void *ptr) { - if (--container_of(ptr, struct ipc_rcu_hdr, data)->refcount > 0) + if (!atomic_dec_and_test(&container_of(ptr, struct ipc_rcu_hdr, data)->refcount)) return; if (container_of(ptr, struct ipc_rcu_hdr, data)->is_vmalloc) { diff --git a/ipc/util.h b/ipc/util.h index c36b9977c957..2b0bdd5d92ce 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -119,7 +119,7 @@ void ipc_free(void* ptr, int size); * to 0 schedules the rcu destruction. Caller must guarantee locking. */ void* ipc_rcu_alloc(int size); -void ipc_rcu_getref(void *ptr); +int ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); -- 2.39.5