From 5fdee8c4a5e1800489ce61963208f8cc55e42ea1 Mon Sep 17 00:00:00 2001 From: Salman Date: Tue, 10 Aug 2010 18:03:16 -0700 Subject: pids: fix a race in pid generation that causes pids to be reused immediately A program that repeatedly forks and waits is susceptible to having the same pid repeated, especially when it competes with another instance of the same program. This is really bad for bash implementation. Furthermore, many shell scripts assume that pid numbers will not be used for some length of time. Race Description: A B // pid == offset == n // pid == offset == n + 1 test_and_set_bit(offset, map->page) test_and_set_bit(offset, map->page); pid_ns->last_pid = pid; pid_ns->last_pid = pid; // pid == n + 1 is freed (wait()) // Next fork()... last = pid_ns->last_pid; // == n pid = last + 1; Code to reproduce it (Running multiple instances is more effective): #include #include #include #include #include #include // The distance mod 32768 between two pids, where the first pid is expected // to be smaller than the second. int PidDistance(pid_t first, pid_t second) { return (second + 32768 - first) % 32768; } int main(int argc, char* argv[]) { int failed = 0; pid_t last_pid = 0; int i; printf("%d\n", sizeof(pid_t)); for (i = 0; i < 10000000; ++i) { if (i % 32786 == 0) printf("Iter: %d\n", i/32768); int child_exit_code = i % 256; pid_t pid = fork(); if (pid == -1) { fprintf(stderr, "fork failed, iteration %d, errno=%d", i, errno); exit(1); } if (pid == 0) { // Child exit(child_exit_code); } else { // Parent if (i > 0) { int distance = PidDistance(last_pid, pid); if (distance == 0 || distance > 30000) { fprintf(stderr, "Unexpected pid sequence: previous fork: pid=%d, " "current fork: pid=%d for iteration=%d.\n", last_pid, pid, i); failed = 1; } } last_pid = pid; int status; int reaped = wait(&status); if (reaped != pid) { fprintf(stderr, "Wait return value: expected pid=%d, " "got %d, iteration %d\n", pid, reaped, i); failed = 1; } else if (WEXITSTATUS(status) != child_exit_code) { fprintf(stderr, "Unexpected exit status %x, iteration %d\n", WEXITSTATUS(status), i); failed = 1; } } } exit(failed); } Thanks to Ted Tso for the key ideas of this implementation. Signed-off-by: Salman Qazi Cc: Ingo Molnar Cc: Theodore Ts'o Cc: Peter Zijlstra Cc: Sukadev Bhattiprolu Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c132d2..fbbd5f6b6f2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) atomic_inc(&map->nr_free); } +/* + * If we started walking pids at 'base', is 'a' seen before 'b'? + */ +static int pid_before(int base, int a, int b) +{ + /* + * This is the same as saying + * + * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT + * and that mapping orders 'a' and 'b' with respect to 'base'. + */ + return (unsigned)(a - base) < (unsigned)(b - base); +} + +/* + * We might be racing with someone else trying to set pid_ns->last_pid. + * We want the winner to have the "later" value, because if the + * "earlier" value prevails, then a pid may get reused immediately. + * + * Since pids rollover, it is not sufficient to just pick the bigger + * value. We have to consider where we started counting from. + * + * 'base' is the value of pid_ns->last_pid that we observed when + * we started looking for a pid. + * + * 'pid' is the pid that we eventually found. + */ +static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) +{ + int prev; + int last_write = base; + do { + prev = last_write; + last_write = cmpxchg(&pid_ns->last_pid, prev, pid); + } while ((prev != last_write) && (pid_before(base, last_write, pid))); +} + static int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; @@ -154,7 +191,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pid_ns->last_pid = pid; + set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); -- cgit v1.2.3-70-g09d2 From c52b0b91ba1f4b7ea90e20385c0a6df0ba54aed4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Aug 2010 18:03:17 -0700 Subject: pids: alloc_pidmap: remove the unnecessary boundary checks alloc_pidmap() calculates max_scan so that if the initial offset != 0 we inspect the first map->page twice. This is correct, we want to find the unused bits < offset in this bitmap block. Add the comment. But it doesn't make any sense to stop the find_next_offset() loop when we are looking into this map->page for the second time. We have already already checked the bits >= offset during the first attempt, it is fine to do this again, no matter if we succeed this time or not. Remove this hard-to-understand code. It optimizes the very unlikely case when we are going to fail, but slows down the more likely case. Signed-off-by: Oleg Nesterov Cc: Salman Qazi Cc: Ingo Molnar Cc: Sukadev Bhattiprolu Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index fbbd5f6b6f2..d55c6fb8d08 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -169,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; + /* + * If last_pid points into the middle of the map->page we + * want to scan this bitmap block twice, the second time + * we start with offset == 0 (or RESERVED_PIDS). + */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); @@ -196,15 +201,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) } offset = find_next_offset(map, offset); pid = mk_pid(pid_ns, map, offset); - /* - * find_next_offset() found a bit, the pid from it - * is in-bounds, and if we fell back to the last - * bitmap block and the final block was the same - * as the starting point, pid is before last_pid. - */ - } while (offset < BITS_PER_PAGE && pid < pid_max && - (i != max_scan || pid < last || - !((last+1) & BITS_PER_PAGE_MASK))); + } while (offset < BITS_PER_PAGE && pid < pid_max); } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; -- cgit v1.2.3-70-g09d2 From 67bdbffd696f29a0b68aa8daa285783a06651583 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2010 16:55:13 +0100 Subject: rculist: avoid __rcu annotations This avoids warnings from missing __rcu annotations in the rculist implementation, making it possible to use the same lists in both RCU and non-RCU cases. We can add rculist annotations later, together with lockdep support for rculist, which is missing as well, but that may involve changing all the users. Signed-off-by: Arnd Bergmann Signed-off-by: Paul E. McKenney Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Reviewed-by: Josh Triplett --- include/linux/rculist.h | 53 +++++++++++++++++++++++++++---------------- include/linux/rculist_nulls.h | 16 +++++++++---- kernel/pid.c | 2 +- 3 files changed, 46 insertions(+), 25 deletions(-) (limited to 'kernel/pid.c') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4ec3b38ce9c..c10b1050dbe 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -9,6 +9,12 @@ #include #include +/* + * return the ->next pointer of a list_head in an rcu safe + * way, we must not access it directly + */ +#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) + /* * Insert a new entry between two known consecutive entries. * @@ -20,7 +26,7 @@ static inline void __list_add_rcu(struct list_head *new, { new->next = next; new->prev = prev; - rcu_assign_pointer(prev->next, new); + rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } @@ -138,7 +144,7 @@ static inline void list_replace_rcu(struct list_head *old, { new->next = old->next; new->prev = old->prev; - rcu_assign_pointer(new->prev->next, new); + rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } @@ -193,7 +199,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ last->next = at; - rcu_assign_pointer(head->next, first); + rcu_assign_pointer(list_next_rcu(head), first); first->prev = head; at->prev = last; } @@ -208,7 +214,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(rcu_dereference_raw(ptr), type, member) + ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ + }) /** * list_first_entry_rcu - get the first element from a list @@ -225,9 +233,9 @@ static inline void list_splice_init_rcu(struct list_head *list, list_entry_rcu((ptr)->next, type, member) #define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw((head)->next); \ + for (pos = rcu_dereference_raw(list_next_rcu(head)); \ pos != (head); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(list_next_rcu((pos))) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -257,9 +265,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference_raw((pos)->next); \ + for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference_raw((pos)->next)) + (pos) = rcu_dereference_raw(list_next_rcu(pos))) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type @@ -314,12 +322,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old, new->next = next; new->pprev = old->pprev; - rcu_assign_pointer(*new->pprev, new); + rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) new->next->pprev = &new->next; old->pprev = LIST_POISON2; } +/* + * return the first or the next element in an RCU protected hlist + */ +#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) +#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) +#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) + /** * hlist_add_head_rcu * @n: the element to add to the hash list. @@ -346,7 +361,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_first_rcu(h), n); if (first) first->pprev = &n->next; } @@ -374,7 +389,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, { n->pprev = next->pprev; n->next = next; - rcu_assign_pointer(*(n->pprev), n); + rcu_assign_pointer(hlist_pprev_rcu(n), n); next->pprev = &n->next; } @@ -401,15 +416,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, { n->next = prev->next; n->pprev = &prev->next; - rcu_assign_pointer(prev->next, n); + rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) n->next->pprev = &n->next; } -#define __hlist_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ prefetch(pos->next); 1; }); \ - pos = rcu_dereference(pos->next)) +#define __hlist_for_each_rcu(pos, head) \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ + pos && ({ prefetch(pos->next); 1; }); \ + pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type @@ -422,11 +437,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ -#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \ pos && ({ prefetch(pos->next); 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index b70ffe53cb9..2ae13714828 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) } } +#define hlist_nulls_first_rcu(head) \ + (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) + +#define hlist_nulls_next_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, n->next = first; n->pprev = &h->first; - rcu_assign_pointer(h->first, n); + rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) first->pprev = &n->next; } @@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @member: the name of the hlist_nulls_node within the struct. * */ -#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw((head)->first); \ - (!is_a_nulls(pos)) && \ +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) #endif #endif diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d08..0f90c2f713f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) -- cgit v1.2.3-70-g09d2 From 4221a9918e38b7494cee341dda7b7b4bb8c04bde Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 26 Jun 2010 01:08:19 +0900 Subject: Add RCU check for find_task_by_vpid(). find_task_by_vpid() says "Must be called under rcu_read_lock().". But due to commit 3120438 "rcu: Disable lockdep checking in RCU list-traversal primitives", we are currently unable to catch "find_task_by_vpid() with tasklist_lock held but RCU lock not held" errors due to the RCU-lockdep checks being suppressed in the RCU variants of the struct list_head traversals. This commit therefore places an explicit check for being in an RCU read-side critical section in find_task_by_pid_ns(). =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- kernel/pid.c:386 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by rc.sysinit/1102: #0: (tasklist_lock){.+.+..}, at: [] sys_setpgid+0x40/0x160 stack backtrace: Pid: 1102, comm: rc.sysinit Not tainted 2.6.35-rc3-dirty #1 Call Trace: [] lockdep_rcu_dereference+0x94/0xb0 [] find_task_by_pid_ns+0x6d/0x70 [] find_task_by_vpid+0x18/0x20 [] sys_setpgid+0x47/0x160 [] sysenter_do_call+0x12/0x36 Commit updated to use a new rcu_lockdep_assert() exported API rather than the old internal __do_rcu_dereference(). Signed-off-by: Tetsuo Handa Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 14 +++++++++----- kernel/pid.c | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel/pid.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b973dea2d6b..b124bc6a75a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -215,7 +215,11 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); -#define __do_rcu_dereference_check(c) \ +/** + * rcu_lockdep_assert - emit lockdep splat if specified condition not met + * @c: condition to check + */ +#define rcu_lockdep_assert(c) \ do { \ static bool __warned; \ if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ @@ -226,7 +230,7 @@ extern int rcu_my_thread_group_empty(void); #else /* #ifdef CONFIG_PROVE_RCU */ -#define __do_rcu_dereference_check(c) do { } while (0) +#define rcu_lockdep_assert(c) do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ @@ -247,14 +251,14 @@ extern int rcu_my_thread_group_empty(void); #define __rcu_dereference_check(p, c, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ (void) (((typeof (*p) space *)p) == p); \ smp_read_barrier_depends(); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ (void) (((typeof (*p) space *)p) == p); \ ((typeof(*p) __force __kernel *)(p)); \ }) @@ -262,7 +266,7 @@ extern int rcu_my_thread_group_empty(void); #define __rcu_dereference_index_check(p, c) \ ({ \ typeof(p) _________p1 = ACCESS_ONCE(p); \ - __do_rcu_dereference_check(c); \ + rcu_lockdep_assert(c); \ smp_read_barrier_depends(); \ (_________p1); \ }) diff --git a/kernel/pid.c b/kernel/pid.c index 0f90c2f713f..39b65b69584 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } -- cgit v1.2.3-70-g09d2