From 3854a771821c970065e3203a0b40ddc4101538cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:29 -0700 Subject: __exit_signal: don't take rcu lock There is no reason for rcu_read_lock() in __exit_signal(). tsk->sighand can only be changed if tsk does exec, obviously this is not possible. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 93d2711b938..a7799d8a640 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); -- cgit v1.2.3-70-g09d2 From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/init_task.h | 2 +- include/linux/sched.h | 1 + kernel/exit.c | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index af249af4cca..cd2e8c9b124 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1326,6 +1326,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 93c45acf249..021d8e720c7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -122,7 +122,7 @@ extern struct group_info init_groups; .state = 0, \ .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ - .flags = 0, \ + .flags = PF_KTHREAD, \ .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 79e749dbf81..eec64a4adb9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,6 +1483,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8a640..28a44a2612d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit v1.2.3-70-g09d2 From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: turn mm->core_startup_done into the pointer to struct core_state mm->core_startup_done points to "struct completion startup_done" allocated on the coredump_wait()'s stack. Introduce the new structure, core_state, which holds this "struct completion". This way we can add more info visible to the threads participating in coredump without enlarging mm_struct. No changes in affected .o files. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++++---- include/linux/mm_types.h | 7 ++++++- kernel/exit.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index e347e6ed161..71734568f01 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1597,13 +1597,13 @@ static int coredump_wait(int exit_code) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion startup_done; + struct core_state core_state; struct completion *vfork_done; int core_waiters; init_completion(&mm->core_done); - init_completion(&startup_done); - mm->core_startup_done = &startup_done; + init_completion(&core_state.startup); + mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); @@ -1622,7 +1622,7 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&startup_done); + wait_for_completion(&core_state.startup); fail: BUG_ON(mm->core_waiters); return core_waiters; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 02a27ae7853..97819efd233 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,6 +159,10 @@ struct vm_area_struct { #endif }; +struct core_state { + struct completion startup; +}; + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -220,7 +224,8 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ /* coredumping support */ - struct completion *core_startup_done, core_done; + struct core_state *core_state; + struct completion core_done; /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index 28a44a2612d..f7fa21dbced 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk) up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); if (!--mm->core_waiters) - complete(mm->core_startup_done); + complete(&mm->core_state->startup); up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); -- cgit v1.2.3-70-g09d2 From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: move mm->core_waiters into struct core_state Move mm->core_waiters into "struct core_state" allocated on stack. This shrinks mm_struct a little bit and allows further changes. This patch mostly does s/core_waiters/core_state. The only essential change is that coredump_wait() must clear mm->core_state before return. The coredump_wait()'s path is uglified and .text grows by 30 bytes, this is fixed by the next patch. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 +++++++++++---------- include/linux/mm_types.h | 2 +- kernel/exit.c | 8 ++++---- kernel/fork.c | 2 +- kernel/signal.c | 4 ++-- 5 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 71734568f01..50de3aaff4d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -722,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm) * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around - * checking core_waiters and changing tsk->mm. The - * core-inducing thread will increment core_waiters for - * each thread whose ->mm == old_mm. + * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); - if (unlikely(old_mm->core_waiters)) { + if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } @@ -1514,7 +1512,7 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_waiters++; + t->mm->core_state->nr_threads++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } @@ -1538,11 +1536,11 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (err) return err; - if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + if (atomic_read(&mm->mm_users) == mm->core_state->nr_threads + 1) goto done; /* * We should find and kill all tasks which use this mm, and we should - * count them correctly into mm->core_waiters. We don't take tasklist + * count them correctly into ->nr_threads. We don't take tasklist * lock, but this is safe wrt: * * fork: @@ -1590,7 +1588,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - return mm->core_waiters; + return mm->core_state->nr_threads; } static int coredump_wait(int exit_code) @@ -1603,9 +1601,12 @@ static int coredump_wait(int exit_code) init_completion(&mm->core_done); init_completion(&core_state.startup); + core_state.nr_threads = 0; mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); + if (core_waiters < 0) + mm->core_state = NULL; up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1623,8 +1624,8 @@ static int coredump_wait(int exit_code) if (core_waiters) wait_for_completion(&core_state.startup); + mm->core_state = NULL; fail: - BUG_ON(mm->core_waiters); return core_waiters; } @@ -1702,7 +1703,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 97819efd233..c0b1747b61a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,6 +160,7 @@ struct vm_area_struct { }; struct core_state { + int nr_threads; struct completion startup; }; @@ -179,7 +180,6 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ - int core_waiters; struct rw_semaphore mmap_sem; spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/exit.c b/kernel/exit.c index f7fa21dbced..988e232254e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + if (mm->core_state) { up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); - if (!--mm->core_waiters) + if (!--mm->core_state->nr_threads) complete(&mm->core_state->startup); up_write(&mm->mmap_sem); diff --git a/kernel/fork.c b/kernel/fork.c index eeaec6893b0..813d5c89b9d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 39c1706edf0..5c7b7eaa0dc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1480,10 +1480,10 @@ static inline int may_ptrace_stop(void) * is a deadlock situation, and pointless because our tracer * is dead so don't allow us to stop. * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_waiters != 0. Otherwise it + * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). */ - if (unlikely(current->mm->core_waiters) && + if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) return 0; -- cgit v1.2.3-70-g09d2 From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: turn core_state->nr_threads into atomic_t Turn core_state->nr_threads into atomic_t and kill now unneeded down_write(&mm->mmap_sem) in exit_mm(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm_types.h | 2 +- kernel/exit.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index c74bb34eeef..15d493fe8aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1591,7 +1591,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - core_state->nr_threads = nr; + atomic_set(&core_state->nr_threads, nr); return nr; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c0b1747b61a..ae99a28ba6a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,7 +160,7 @@ struct vm_area_struct { }; struct core_state { - int nr_threads; + atomic_t nr_threads; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 988e232254e..63d82957baa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); if (mm->core_state) { up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_state->nr_threads) + + if (atomic_dec_and_test(&mm->core_state->nr_threads)) complete(&mm->core_state->startup); - up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:44 -0700 Subject: coredump: construct the list of coredumping threads at startup time binfmt->core_dump() has to iterate over the all threads in system in order to find the coredumping threads and construct the list using the GFP_ATOMIC allocations. With this patch each thread allocates the list node on exit_mm()'s stack and adds itself to the list. This allows us to do further changes: - simplify ->core_dump() - change exit_mm() to clear ->mm first, then wait for ->core_done. this makes the coredumping process visible to oom_kill - kill mm->core_done Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ include/linux/mm_types.h | 6 ++++++ kernel/exit.c | 15 ++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index b8ee842d93c..fe2873b8037 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1604,6 +1604,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) init_completion(&mm->core_done); init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ae99a28ba6a..4d0d0abc79f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,8 +159,14 @@ struct vm_area_struct { #endif }; +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + struct core_state { atomic_t nr_threads; + struct core_thread dumper; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 63d82957baa..b66f0d55c79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -664,6 +664,7 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) @@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk) * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_state) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - if (atomic_dec_and_test(&mm->core_state->nr_threads)) - complete(&mm->core_state->startup); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:46 -0700 Subject: coredump: kill mm->core_done Now that we have core_state->dumper list we can use it to wake up the sub-threads waiting for the coredump completion. This uglifies the code and .text grows by 47 bytes, but otoh mm_struct lessens by sizeof(struct completion). Also, with this change we can decouple exit_mm() from the coredumping code. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 25 ++++++++++++++++++++++--- include/linux/mm_types.h | 4 +--- kernel/exit.c | 8 +++++++- 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index fe2873b8037..bff43aeb235 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1602,7 +1602,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) struct completion *vfork_done; int core_waiters; - init_completion(&mm->core_done); init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; @@ -1628,6 +1627,27 @@ fail: return core_waiters; } +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. It modifies lower two bits of mm->flags, but @@ -1812,8 +1832,7 @@ fail_unlock: argv_free(helper_argv); current->fsuid = fsuid; - complete_all(&mm->core_done); - mm->core_state = NULL; + coredump_finish(mm); fail: return retval; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d0d0abc79f..746f975b58e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -229,9 +229,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ - /* coredumping support */ - struct core_state *core_state; - struct completion core_done; + struct core_state *core_state; /* coredumping support */ /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index b66f0d55c79..8a4d4d12e29 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk) if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); - wait_for_completion(&mm->core_done); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); -- cgit v1.2.3-70-g09d2 From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 86 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/sched.h | 4 +++ kernel/exit.c | 27 ++++++++++++++++ kernel/fork.c | 6 ++++ 4 files changed, 108 insertions(+), 15 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a8e15..a891fe4cb43 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) -{ +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; + + if (!whole) { + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); + } else { + unsigned long flags; + struct task_struct *t = task; + rchar = wchar = syscr = syscw = 0; + memset(&ioac, 0, sizeof(ioac)); + + rcu_read_lock(); + do { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + t = next_thread(t); + } while (t != task); + rcu_read_unlock(); + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + rchar += sig->rchar; + wchar += sig->wchar; + syscr += sig->syscr; + syscw += sig->syscw; + + ioac.read_bytes += sig->ioac.read_bytes; + ioac.write_bytes += sig->ioac.write_bytes; + ioac.cancelled_write_bytes += + sig->ioac.cancelled_write_bytes; + + unlock_task_sighand(task, &flags); + } + } + return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)rchar, + (unsigned long long)wchar, + (unsigned long long)syscr, + (unsigned long long)syscw, + (unsigned long long)ioac.read_bytes, + (unsigned long long)ioac.write_bytes, + (unsigned long long)ioac.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); } -#endif + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ /* * Thread groups @@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/include/linux/sched.h b/include/linux/sched.h index af780f299c7..d22ffe06d0e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -506,6 +506,10 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; +#ifdef CONFIG_TASK_XACCT + u64 rchar, wchar, syscr, syscw; +#endif + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d12e29..ad933bb29ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 813d5c89b9d..b99d73e971a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); -- cgit v1.2.3-70-g09d2