diff options
Diffstat (limited to 'fs/proc')
35 files changed, 6445 insertions, 3333 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 50f8f0600f0..2183fcf41d5 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -1,5 +1,5 @@ config PROC_FS - bool "/proc file system support" if EMBEDDED + bool "/proc file system support" if EXPERT default y help This is a virtual file system providing information about the status @@ -31,16 +31,20 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + help + Provides a virtual ELF core file of the live kernel. This can + be read with gdb and other ELF tools. No modifications can be + made using this mechanism. config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED + bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS select SYSCTL default y @@ -61,7 +65,7 @@ config PROC_SYSCTL config PROC_PAGE_MONITOR default y depends on PROC_FS && MMU - bool "Enable /proc page monitoring" if EMBEDDED + bool "Enable /proc page monitoring" if EXPERT help Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 11a7b5c6815..239493ec718 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -2,14 +2,16 @@ # Makefile for the Linux proc filesystem routines. # -obj-$(CONFIG_PROC_FS) += proc.o +obj-y += proc.o proc-y := nommu.o task_nommu.o -proc-$(CONFIG_MMU) := mmu.o task_mmu.o +proc-$(CONFIG_MMU) := task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o + fd.o +proc-$(CONFIG_TTY) += proc_tty.o proc-y += cmdline.o +proc-y += consoles.o proc-y += cpuinfo.o proc-y += devices.o proc-y += interrupts.o @@ -19,10 +21,11 @@ proc-y += stat.o proc-y += uptime.o proc-y += version.o proc-y += softirqs.o +proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o -proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 13b5d070817..64db2bceac5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -68,7 +68,6 @@ #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/swap.h> -#include <linux/slab.h> #include <linux/smp.h> #include <linux/signal.h> #include <linux/highmem.h> @@ -82,7 +81,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> -#include <linux/swapops.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -97,7 +96,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) get_task_comm(tcomm, p); - seq_printf(m, "Name:\t"); + seq_puts(m, "Name:\t"); end = m->buf + m->size; buf = m->buf + m->count; name = tcomm; @@ -124,7 +123,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf++; } m->count = buf - m->buf; - seq_printf(m, "\n"); + seq_putc(m, '\n'); } /* @@ -133,36 +132,29 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * you can test for combinations of others with * simple bit tests. */ -static const char *task_state_array[] = { +static const char * const task_state_array[] = { "R (running)", /* 0 */ "S (sleeping)", /* 1 */ "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char **p = &task_state_array[0]; + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - while (state) { - p++; - state >>= 1; - } - return *p; + return task_state_array[fls(state)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -174,14 +166,15 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; tpid = 0; if (pid_alive(p)) { - struct task_struct *tracer = tracehook_tracer_task(p); + struct task_struct *tracer = ptrace_parent(p); if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" + "Ngid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" "TracerPid:\t%d\n" @@ -189,10 +182,17 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), task_tgid_nr_ns(p, ns), + task_numa_group_id(p), pid_nr_ns(pid, ns), ppid, tpid, - cred->uid, cred->euid, cred->suid, cred->fsuid, - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid)); task_lock(p); if (p->files) @@ -206,19 +206,20 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; task_unlock(p); - for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) - seq_printf(m, "%d ", GROUP_AT(group_info, g)); + for (g = 0; g < group_info->ngroups; g++) + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } -static void render_sigset_t(struct seq_file *m, const char *header, +void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set) { int i; - seq_printf(m, "%s", header); + seq_puts(m, header); i = _NSIG; do { @@ -232,7 +233,7 @@ static void render_sigset_t(struct seq_file *m, const char *header, seq_printf(m, "%x", x); } while (i >= 4); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, @@ -269,9 +270,11 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) shpending = p->signal->shared_pending.signal; blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); - num_threads = atomic_read(&p->signal->count); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); - qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } @@ -291,14 +294,18 @@ static void render_cap_t(struct seq_file *m, const char *header, { unsigned __capi; - seq_printf(m, "%s", header); + seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } +/* Remove non-existent capabilities */ +#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ + CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) + static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -312,12 +319,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); + NORM_CAPS(cap_inheritable); + NORM_CAPS(cap_permitted); + NORM_CAPS(cap_effective); + NORM_CAPS(cap_bset); + render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); } +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -329,12 +348,12 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); + seq_puts(m, "Cpus_allowed:\t"); seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -351,11 +370,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); -#if defined(CONFIG_S390) - task_show_regs(m, task); -#endif task_context_switch_counts(m, task); return 0; } @@ -364,7 +381,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; - long priority, nice; + int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; char state; @@ -383,7 +400,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); @@ -397,8 +414,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; - cgtime = gtime = cputime_zero; + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -410,7 +427,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, tty_nr = new_encode_dev(tty_devnum(sig->tty)); } - num_threads = atomic_read(&sig->count); + num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); cmin_flt = sig->cmin_flt; @@ -418,7 +435,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { @@ -426,14 +443,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, t->gtime); - t = next_thread(t); - } while (t != task); + gtime += task_gtime(t); + } while_each_thread(task, t); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); - gtime = cputime_add(gtime, sig->gtime); + thread_group_cputime_adjusted(task, &utime, &stime); + gtime += sig->gtime; } sid = task_session_nr_ns(task, ns); @@ -448,8 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); - gtime = task->gtime; + task_cputime_adjusted(task, &utime, &stime); + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -465,56 +481,70 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); - seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ -%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", - pid_nr_ns(pid, ns), - tcomm, - state, - ppid, - pgid, - sid, - tty_nr, - tty_pgrp, - task->flags, - min_flt, - cmin_flt, - maj_flt, - cmaj_flt, - cputime_to_clock_t(utime), - cputime_to_clock_t(stime), - cputime_to_clock_t(cutime), - cputime_to_clock_t(cstime), - priority, - nice, - num_threads, - start_time, - vsize, - mm ? get_mm_rss(mm) : 0, - rsslim, - mm ? mm->start_code : 0, - mm ? mm->end_code : 0, - (permitted && mm) ? task->stack_start : 0, - esp, - eip, - /* The signal information here is obsolete. - * It must be decimal for Linux 2.0 compatibility. - * Use /proc/#/status for real-time signals. - */ - task->pending.signal.sig[0] & 0x7fffffffUL, - task->blocked.sig[0] & 0x7fffffffUL, - sigign .sig[0] & 0x7fffffffUL, - sigcatch .sig[0] & 0x7fffffffUL, - wchan, - 0UL, - 0UL, - task->exit_signal, - task_cpu(task), - task->rt_priority, - task->policy, - (unsigned long long)delayacct_blkio_ticks(task), - cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ll(m, ' ', ppid); + seq_put_decimal_ll(m, ' ', pgid); + seq_put_decimal_ll(m, ' ', sid); + seq_put_decimal_ll(m, ' ', tty_nr); + seq_put_decimal_ll(m, ' ', tty_pgrp); + seq_put_decimal_ull(m, ' ', task->flags); + seq_put_decimal_ull(m, ' ', min_flt); + seq_put_decimal_ull(m, ' ', cmin_flt); + seq_put_decimal_ull(m, ' ', maj_flt); + seq_put_decimal_ull(m, ' ', cmaj_flt); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, ' ', priority); + seq_put_decimal_ll(m, ' ', nice); + seq_put_decimal_ll(m, ' ', num_threads); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', start_time); + seq_put_decimal_ull(m, ' ', vsize); + seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', rsslim); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, ' ', esp); + seq_put_decimal_ull(m, ' ', eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', wchan); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ll(m, ' ', task->exit_signal); + seq_put_decimal_ll(m, ' ', task_cpu(task)); + seq_put_decimal_ull(m, ' ', task->rt_priority); + seq_put_decimal_ull(m, ' ', task->policy); + seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + + if (mm && permitted) { + seq_put_decimal_ull(m, ' ', mm->start_data); + seq_put_decimal_ull(m, ' ', mm->end_data); + seq_put_decimal_ull(m, ' ', mm->start_brk); + seq_put_decimal_ull(m, ' ', mm->arg_start); + seq_put_decimal_ull(m, ' ', mm->arg_end); + seq_put_decimal_ull(m, ' ', mm->env_start); + seq_put_decimal_ull(m, ' ', mm->env_end); + } else + seq_printf(m, " 0 0 0 0 0 0 0"); + + if (permitted) + seq_put_decimal_ll(m, ' ', task->exit_code); + else + seq_put_decimal_ll(m, ' ', 0); + + seq_putc(m, '\n'); if (mm) mmput(mm); return 0; @@ -535,15 +565,150 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - seq_printf(m, "%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, 0, size); + seq_put_decimal_ull(m, ' ', resident); + seq_put_decimal_ull(m, ' ', shared); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', data); + seq_put_decimal_ull(m, ' ', 0); + seq_putc(m, '\n'); + + return 0; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +static struct pid * +get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +{ + struct task_struct *start, *task; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + /* + * Lets try to continue searching first, this gives + * us significant speedup on children-rich processes. + */ + if (pid_prev) { + task = pid_task(pid_prev, PIDTYPE_PID); + if (task && task->real_parent == start && + !(list_empty(&task->sibling))) { + if (list_is_last(&task->sibling, &start->children)) + goto out; + task = list_first_entry(&task->sibling, + struct task_struct, sibling); + pid = get_pid(task_pid(task)); + goto out; + } + } + + /* + * Slow search case. + * + * We might miss some children here if children + * are exited while we were not holding the lock, + * but it was never promised to be accurate that + * much. + * + * "Just suppose that the parent sleeps, but N children + * exit after we printed their tids. Now the slow paths + * skips N extra children, we miss N tasks." (c) + * + * So one need to stop or freeze the leader and all + * its children to get a precise result. + */ + list_for_each_entry(task, &start->children, sibling) { + if (pos-- == 0) { + pid = get_pid(task_pid(task)); + break; + } + } + +out: + read_unlock(&tasklist_lock); + return pid; +} + +static int children_seq_show(struct seq_file *seq, void *v) +{ + struct inode *inode = seq->private; + pid_t pid; + pid = pid_nr_ns(v, inode->i_sb->s_fs_info); + return seq_printf(seq, "%d ", pid); +} + +static void *children_seq_start(struct seq_file *seq, loff_t *pos) +{ + return get_children_pid(seq->private, NULL, *pos); +} + +static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct pid *pid; + + pid = get_children_pid(seq->private, v, *pos + 1); + put_pid(v); + + ++*pos; + return pid; +} + +static void children_seq_stop(struct seq_file *seq, void *v) +{ + put_pid(v); +} + +static const struct seq_operations children_seq_ops = { + .start = children_seq_start, + .next = children_seq_next, + .stop = children_seq_stop, + .show = children_seq_show, +}; + +static int children_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &children_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = inode; + + return ret; +} + +int children_seq_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); return 0; } + +const struct file_operations proc_tid_children_operations = { + .open = children_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = children_seq_release, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 18d5cc62d8e..2d696b0c93b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,6 +63,7 @@ #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> @@ -72,6 +73,7 @@ #include <linux/security.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/printk.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/audit.h> @@ -80,8 +82,17 @@ #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/flex_array.h> +#include <linux/posix-timers.h> +#ifdef CONFIG_HARDWALL +#include <asm/hardwall.h> +#endif +#include <trace/events/oom.h> #include "internal.h" +#include "fd.h" /* NOTE: * Implementing inode permission operations in /proc is almost @@ -96,7 +107,7 @@ struct pid_entry { char *name; int len; - mode_t mode; + umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; @@ -147,151 +158,58 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, return count; } -static int get_fs_path(struct task_struct *task, struct path *path, bool root) +static int get_task_root(struct task_struct *task, struct path *root) { - struct fs_struct *fs; int result = -ENOENT; task_lock(task); - fs = task->fs; - if (fs) { - read_lock(&fs->lock); - *path = root ? fs->root : fs->pwd; - path_get(path); - read_unlock(&fs->lock); + if (task->fs) { + get_fs_root(task->fs, root); result = 0; } task_unlock(task); return result; } -static int get_nr_threads(struct task_struct *tsk) -{ - unsigned long flags; - int count = 0; - - if (lock_task_sighand(tsk, &flags)) { - count = atomic_read(&tsk->signal->count); - unlock_task_sighand(tsk, &flags); - } - return count; -} - -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { - result = get_fs_path(task, path, 0); + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); put_task_struct(task); } return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { - result = get_fs_path(task, path, 1); + result = get_task_root(task, path); put_task_struct(task); } return result; } -/* - * Return zero if current may access user memory in @task, -error if not. - */ -static int check_mem_permission(struct task_struct *task) +static int proc_pid_cmdline(struct task_struct *task, char *buffer) { - /* - * A task can always look at itself, in case it chooses - * to use system calls instead of load instructions. - */ - if (task == current) - return 0; - - /* - * If current is actively ptrace'ing, and would also be - * permitted to freshly attach with ptrace now, permit it. - */ - if (task_is_stopped_or_traced(task)) { - int match; - rcu_read_lock(); - match = (tracehook_tracer_task(task) == current); - rcu_read_unlock(); - if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return 0; - } - - /* - * Noone else is allowed. - */ - return -EPERM; -} - -struct mm_struct *mm_for_maps(struct task_struct *task) -{ - struct mm_struct *mm; - - if (mutex_lock_killable(&task->cred_guard_mutex)) - return NULL; - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, PTRACE_MODE_READ)) { - mmput(mm); - mm = NULL; - } - mutex_unlock(&task->cred_guard_mutex); - - return mm; -} - -static int proc_pid_cmdline(struct task_struct *task, char * buffer) -{ - int res = 0; - unsigned int len; - struct mm_struct *mm = get_task_mm(task); - if (!mm) - goto out; - if (!mm->arg_end) - goto out_mm; /* Shh! No looking before we're done */ - - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - - res = access_process_vm(task, mm->arg_start, buffer, len, 0); - - // If the nul at the end of args has been overwritten, then - // assume application is using setproctitle(3). - if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { - len = strnlen(buffer, res); - if (len < res) { - res = len; - } else { - len = mm->env_end - mm->env_start; - if (len > PAGE_SIZE - res) - len = PAGE_SIZE - res; - res += access_process_vm(task, mm->env_start, buffer+res, len, 0); - res = strnlen(buffer, res); - } - } -out_mm: - mmput(mm); -out: - return res; + return get_cmdline(task, buffer, PAGE_SIZE); } static int proc_pid_auxv(struct task_struct *task, char *buffer) { - int res = 0; - struct mm_struct *mm = get_task_mm(task); - if (mm) { + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + int res = PTR_ERR(mm); + if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { nwords += 2; @@ -328,6 +246,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) } #endif /* CONFIG_KALLSYMS */ +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + #ifdef CONFIG_STACKTRACE #define MAX_STACK_TRACE_DEPTH 64 @@ -337,6 +272,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, { struct stack_trace trace; unsigned long *entries; + int err; int i; entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); @@ -347,15 +283,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, trace.max_entries = MAX_STACK_TRACE_DEPTH; trace.entries = entries; trace.skip = 0; - save_stack_trace_tsk(task, &trace); - for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%p>] %pS\n", - (void *)entries[i], (void *)entries[i]); + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%pK>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); } kfree(entries); - return 0; + return err; } #endif @@ -383,26 +324,20 @@ static int lstats_show_proc(struct seq_file *m, void *v) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < 32; i++) { - if (task->latency_record[i].backtrace[0]) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", - task->latency_record[i].count, - task->latency_record[i].time, - task->latency_record[i].max); + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!task->latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (task->latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, task->latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } } @@ -418,7 +353,7 @@ static int lstats_open(struct inode *inode, struct file *file) static ssize_t lstats_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -438,16 +373,46 @@ static const struct file_operations proc_lstats_operations = { #endif -/* The badness from the OOM killer */ -unsigned long badness(struct task_struct *p, unsigned long uptime); +#ifdef CONFIG_CGROUPS +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +static const struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_PROC_PID_CPUSET + +static int cpuset_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); +} + +static const struct file_operations proc_cpuset_operations = { + .open = cpuset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_oom_score(struct task_struct *task, char *buffer) { - unsigned long points; - struct timespec uptime; + unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long points = 0; - do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); - points = badness(task->group_leader, uptime.tv_sec); + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -526,18 +491,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer) { long nr; unsigned long args[6], sp, pc; + int res = lock_trace(task); + if (res) + return res; if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) - return sprintf(buffer, "running\n"); - - if (nr < 0) - return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); - - return sprintf(buffer, + res = sprintf(buffer, "running\n"); + else if (nr < 0) + res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + res = sprintf(buffer, "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", nr, args[0], args[1], args[2], args[3], args[4], args[5], sp, pc); + unlock_trace(task); + return res; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ @@ -562,7 +531,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -static int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ -571,133 +540,62 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; error = inode_change_ok(inode, attr); - if (!error) - error = inode_setattr(inode, attr); - return error; -} - -static const struct inode_operations proc_def_inode_operations = { - .setattr = proc_setattr, -}; - -static int mounts_open_common(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - struct task_struct *task = get_proc_task(inode); - struct nsproxy *nsp; - struct mnt_namespace *ns = NULL; - struct path root; - struct proc_mounts *p; - int ret = -EINVAL; - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - ns = nsp->mnt_ns; - if (ns) - get_mnt_ns(ns); - } - rcu_read_unlock(); - if (ns && get_fs_path(task, &root, 1) == 0) - ret = 0; - put_task_struct(task); - } - - if (!ns) - goto err; - if (ret) - goto err_put_ns; - - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) - goto err_put_path; - - file->private_data = &p->m; - ret = seq_open(file, op); - if (ret) - goto err_free; - - p->m.private = p; - p->ns = ns; - p->root = root; - p->event = ns->event; + if (error) + return error; + setattr_copy(inode, attr); + mark_inode_dirty(inode); return 0; - - err_free: - kfree(p); - err_put_path: - path_put(&root); - err_put_ns: - put_mnt_ns(ns); - err: - return ret; } -static int mounts_release(struct inode *inode, struct file *file) +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) { - struct proc_mounts *p = file->private_data; - path_put(&p->root); - put_mnt_ns(p->ns); - return seq_release(inode, file); + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); } -static unsigned mounts_poll(struct file *file, poll_table *wait) -{ - struct proc_mounts *p = file->private_data; - struct mnt_namespace *ns = p->ns; - unsigned res = POLLIN | POLLRDNORM; - - poll_wait(file, &ns->poll, wait); - - spin_lock(&vfsmount_lock); - if (p->event != ns->event) { - p->event = ns->event; - res |= POLLERR | POLLPRI; - } - spin_unlock(&vfsmount_lock); - return res; -} - -static int mounts_open(struct inode *inode, struct file *file) +static int proc_pid_permission(struct inode *inode, int mask) { - return mounts_open_common(inode, file, &mounts_op); -} + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; -static const struct file_operations proc_mounts_operations = { - .open = mounts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); -static int mountinfo_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountinfo_op); + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); } -static const struct file_operations proc_mountinfo_operations = { - .open = mountinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; -static int mountstats_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountstats_op); -} -static const struct file_operations proc_mountstats_operations = { - .open = mountstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, }; #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ @@ -705,7 +603,7 @@ static const struct file_operations proc_mountstats_operations = { static ssize_t proc_info_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); unsigned long page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -734,6 +632,7 @@ out_no_task: static const struct file_operations proc_info_file_operations = { .read = proc_info_read, + .llseek = generic_file_llseek, }; static int proc_single_show(struct seq_file *m, void *v) @@ -758,14 +657,7 @@ static int proc_single_show(struct seq_file *m, void *v) static int proc_single_open(struct inode *inode, struct file *filp) { - int ret; - ret = single_open(filp, proc_single_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, proc_single_show, inode); } static const struct file_operations proc_single_file_operations = { @@ -775,130 +667,105 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; -static int mem_open(struct inode* inode, struct file* file) -{ - file->private_data = (void*)((long)current->self_exec_id); - return 0; -} - -static ssize_t mem_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char *page; - unsigned long src = *ppos; - int ret = -ESRCH; + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; if (!task) - goto out_no_task; + return -ESRCH; - if (check_mem_permission(task)) - goto out; + mm = mm_access(task, mode); + put_task_struct(task); - ret = -ENOMEM; - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - goto out; + if (IS_ERR(mm)) + return PTR_ERR(mm); - ret = 0; - - mm = get_task_mm(task); - if (!mm) - goto out_free; + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } - ret = -EIO; - - if (file->private_data != (void*)((long)current->self_exec_id)) - goto out_put; + file->private_data = mm; - ret = 0; - - while (count > 0) { - int this_len, retval; + return 0; +} - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_process_vm(task, src, page, this_len, 0); - if (!retval || check_mem_permission(task)) { - if (!ret) - ret = -EIO; - break; - } +static int mem_open(struct inode *inode, struct file *file) +{ + int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - if (copy_to_user(buf, page, retval)) { - ret = -EFAULT; - break; - } - - ret += retval; - src += retval; - buf += retval; - count -= retval; - } - *ppos = src; + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; -out_put: - mmput(mm); -out_free: - free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return ret; } -#define mem_write NULL - -#ifndef mem_write -/* This is a security hazard */ -static ssize_t mem_write(struct file * file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) { - int copied; + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; char *page; - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - unsigned long dst = *ppos; - - copied = -ESRCH; - if (!task) - goto out_no_task; - if (check_mem_permission(task)) - goto out; + if (!mm) + return 0; - copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; + return -ENOMEM; copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { - int this_len, retval; + int this_len = min_t(int, count, PAGE_SIZE); - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } - retval = access_process_vm(task, dst, page, this_len, 1); - if (!retval) { + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { if (!copied) copied = -EIO; break; } - copied += retval; - buf += retval; - dst += retval; - count -= retval; + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; } - *ppos = dst; + *ppos = addr; + + mmput(mm); +free: free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return copied; } -#endif + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} loff_t mem_lseek(struct file *file, loff_t offset, int orig) { @@ -916,51 +783,58 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, + .release = mem_release, }; +static int environ_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ); +} + static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; - int ret = -ESRCH; - struct mm_struct *mm; - - if (!task) - goto out_no_task; + int ret = 0; + struct mm_struct *mm = file->private_data; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; + if (!mm) + return 0; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; + return -ENOMEM; ret = 0; - - mm = get_task_mm(task); - if (!mm) - goto out_free; - + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; while (count > 0) { - int this_len, retval, max_len; + size_t this_len, max_len; + int retval; - this_len = mm->env_end - (mm->env_start + src); - - if (this_len <= 0) + if (src >= (mm->env_end - mm->env_start)) break; - max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - this_len = (this_len > max_len) ? max_len : this_len; + this_len = mm->env_end - (mm->env_start + src); + + max_len = min_t(size_t, PAGE_SIZE, count); + this_len = min(max_len, this_len); - retval = access_process_vm(task, (mm->env_start + src), + retval = access_remote_vm(mm, (mm->env_start + src), page, this_len, 0); if (retval <= 0) { @@ -979,91 +853,214 @@ static ssize_t environ_read(struct file *file, char __user *buf, count -= retval; } *ppos = src; - mmput(mm); -out_free: + +free: free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return ret; } static const struct file_operations proc_environ_operations = { + .open = environ_open, .read = environ_read, + .llseek = generic_file_llseek, + .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; size_t len; - int oom_adjust = OOM_DISABLE; unsigned long flags; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; unlock_task_sighand(task, &flags); } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file_inode(file)); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + short oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF]; - long oom_adjust; unsigned long flags; + int oom_score_adj; int err; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } - err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) - return -EINVAL; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) - return -EINVAL; + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file_inode(file)); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; if (!lock_task_sighand(task, &flags)) { - put_task_struct(task); - return -ESRCH; + err = -ESRCH; + goto err_task_lock; } - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - unlock_task_sighand(task, &flags); - put_task_struct(task); - return -EACCES; + if ((short)oom_score_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; } - task->signal->oom_adj = oom_adjust; + task->signal->oom_score_adj = (short)oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_score_adj; + trace_oom_score_adj_update(task); +err_sighand: unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); put_task_struct(task); - - return count; +out: + return err < 0 ? err : count; } -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, }; #ifdef CONFIG_AUDITSYSCALL @@ -1071,7 +1068,7 @@ static const struct file_operations proc_oom_adjust_operations = { static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; @@ -1079,7 +1076,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - audit_get_loginuid(task)); + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1087,16 +1085,18 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *page, *tmp; ssize_t length; uid_t loginuid; + kuid_t kloginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - - if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); if (count >= PAGE_SIZE) count = PAGE_SIZE - 1; @@ -1119,7 +1119,19 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } + } + + length = audit_set_loginuid(kloginuid); if (likely(length == 0)) length = count; @@ -1131,12 +1143,13 @@ out_free_page: static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, + .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; @@ -1151,6 +1164,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf, static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, + .llseek = generic_file_llseek, }; #endif @@ -1158,7 +1172,7 @@ static const struct file_operations proc_sessionid_operations = { static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; size_t len; int make_it_fail; @@ -1190,7 +1204,10 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - task = get_proc_task(file->f_dentry->d_inode); + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; task->make_it_fail = make_it_fail; @@ -1202,6 +1219,7 @@ static ssize_t proc_fault_inject_write(struct file * file, static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, + .llseek = generic_file_llseek, }; #endif @@ -1229,7 +1247,7 @@ static ssize_t sched_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct task_struct *p; p = get_proc_task(inode); @@ -1244,9 +1262,76 @@ sched_write(struct file *file, const char __user *buf, static int sched_open(struct inode *inode, struct file *filp) { + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ int ret; - ret = single_open(filp, sched_show, NULL); + ret = single_open(filp, sched_autogroup_show, NULL); if (!ret) { struct seq_file *m = filp->private_data; @@ -1255,27 +1340,26 @@ static int sched_open(struct inode *inode, struct file *filp) return ret; } -static const struct file_operations proc_pid_sched_operations = { - .open = sched_open, +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, .read = seq_read, - .write = sched_write, + .write = sched_autogroup_write, .llseek = seq_lseek, .release = single_release, }; -#endif +#endif /* CONFIG_SCHED_AUTOGROUP */ static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); @@ -1312,15 +1396,7 @@ static int comm_show(struct seq_file *m, void *v) static int comm_open(struct inode *inode, struct file *filp) { - int ret; - - ret = single_open(filp, comm_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; + return single_open(filp, comm_show, inode); } static const struct file_operations proc_pid_set_comm_operations = { @@ -1331,64 +1407,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ - if (new_exe_file) - get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ - struct file *exe_file; - - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); - return exe_file; -} - -void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1409,17 +1434,19 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; + struct path path; int error = -EACCES; - /* We don't need a base pointer in the /proc filesystem */ - path_put(&nd->path); - /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); - nd->last_type = LAST_BIND; + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + nd_jump_link(nd, &path); + return NULL; out: return ERR_PTR(error); } @@ -1458,7 +1485,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1468,7 +1495,7 @@ out: return error; } -static const struct inode_operations proc_pid_link_inode_operations = { +const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, @@ -1477,23 +1504,7 @@ static const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -static int task_dumpable(struct task_struct *task) -{ - int dumpable = 0; - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) - dumpable = get_dumpable(mm); - task_unlock(task); - if(dumpable == 1) - return 1; - return 0; -} - - -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1507,6 +1518,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); + inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_op = &proc_def_inode_operations; @@ -1534,19 +1546,28 @@ out_unlock: return NULL; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -1575,12 +1596,18 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat * made this apply to all per process world readable and executable * directories. */ -static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, unsigned int flags) { - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct inode *inode; + struct task_struct *task; const struct cred *cred; + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (task) { if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { @@ -1590,8 +1617,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -1602,16 +1629,21 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + +int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; + return proc_inode_is_dead(dentry->d_inode); } -static const struct dentry_operations pid_dentry_operations = +const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1619,9 +1651,6 @@ static const struct dentry_operations pid_dentry_operations = /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); - /* * Fill a directory entry. * @@ -1634,409 +1663,455 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, * reported by readdir in sync with the inode numbers reported * by stat. */ -static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - char *name, int len, +bool proc_fill_cache(struct file *file, struct dir_context *ctx, + const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; + struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - struct qstr qname; - ino_t ino = 0; - unsigned type = DT_UNKNOWN; - - qname.name = name; - qname.len = len; - qname.hash = full_name_hash(name, len); + unsigned type; + ino_t ino; - child = d_lookup(dir, &qname); + child = d_hash_and_lookup(dir, &qname); if (!child) { - struct dentry *new; - new = d_alloc(dir, &qname); - if (new) { - child = instantiate(dir->d_inode, new, task, ptr); - if (child) - dput(new); - else - child = new; + child = d_alloc(dir, &qname); + if (!child) + goto end_instantiate; + if (instantiate(dir->d_inode, child, task, ptr) < 0) { + dput(child); + goto end_instantiate; } } - if (!child || IS_ERR(child) || !child->d_inode) - goto end_instantiate; inode = child->d_inode; - if (inode) { - ino = inode->i_ino; - type = inode->i_mode >> 12; - } + ino = inode->i_ino; + type = inode->i_mode >> 12; dput(child); + return dir_emit(ctx, name, len, ino, type); + end_instantiate: - if (!ino) - ino = find_inode_number(dir, &qname); - if (!ino) - ino = 1; - return filldir(dirent, name, len, filp->f_pos, ino, type); + return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } -static unsigned name_to_int(struct dentry *dentry) +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; - unsigned n = 0; + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; + return 0; } -#define PROC_FDINFO_MAX 64 - -static int proc_fd_info(struct inode *inode, struct path *path, char *info) +static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; - struct file *file; - int fd = proc_fd(inode); + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - file->f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; - } - spin_unlock(&files->file_lock); - put_files_struct(files); + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EPERM; + goto out_notask; } - return -ENOENT; -} -static int proc_fd_link(struct inode *inode, struct path *path) -{ - return proc_fd_info(inode, path, NULL); -} + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; -static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int fd = proc_fd(inode); - struct files_struct *files; - const struct cred *cred; + mm = mm_access(task, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(mm)) + goto out; - if (task) { - files = get_files_struct(task); - if (files) { + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { rcu_read_lock(); - if (fcheck_files(files, fd)) { - rcu_read_unlock(); - put_files_struct(files); - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - inode->i_mode &= ~(S_ISUID | S_ISGID); - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; rcu_read_unlock(); - put_files_struct(files); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } - put_task_struct(task); + security_task_to_inode(task, inode); + status = 1; } - d_drop(dentry); - return 0; + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; } -static const struct dentry_operations tid_fd_dentry_operations = -{ - .d_revalidate = tid_fd_revalidate, +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, .d_delete = pid_delete_dentry, }; -static struct dentry *proc_fd_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) { - unsigned fd = *(const unsigned *)ptr; - struct file *file; - struct files_struct *files; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + rc = -ENOENT; + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + fmode_t mode; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static int +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + fmode_t mode = (fmode_t)(unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - files = get_files_struct(task); - if (!files) - goto out_iput; - inode->i_mode = S_IFLNK; + return -ENOENT; - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (!file) - goto out_unlock; - if (file->f_mode & FMODE_READ) - inode->i_mode |= S_IRUSR | S_IXUSR; - if (file->f_mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR | S_IXUSR; - spin_unlock(&files->file_lock); - put_files_struct(files); + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; - ei->op.proc_get_link = proc_fd_link; - dentry->d_op = &tid_fd_dentry_operations; + inode->i_mode = S_IFLNK; + + if (mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) - error = NULL; - out: - return error; -out_unlock: - spin_unlock(&files->file_lock); - put_files_struct(files); -out_iput: - iput(inode); - goto out; + return 0; } -static struct dentry *proc_lookupfd_common(struct inode *dir, - struct dentry *dentry, - instantiate_t instantiate) +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { - struct task_struct *task = get_proc_task(dir); - unsigned fd = name_to_int(dentry); - struct dentry *result = ERR_PTR(-ENOENT); + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + int result; + struct mm_struct *mm; + + result = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + result = -ENOENT; + task = get_proc_task(dir); if (!task) - goto out_no_task; - if (fd == ~0U) goto out; - result = instantiate(dir, dentry, task, &fd); -out: + result = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + result = -ENOENT; + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_put_task: put_task_struct(task); -out_no_task: - return result; +out: + return ERR_PTR(result); } -static int proc_readfd_common(struct file * filp, void * dirent, - filldir_t filldir, instantiate_t instantiate) +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); - unsigned int fd, ino; - int retval; - struct files_struct * files; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + int ret; - retval = -ENOENT; - if (!p) - goto out_no_task; - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = filp->f_pos-2; - fd < files_fdtable(files)->max_fds; - fd++, filp->f_pos++) { - char name[PROC_NUMBUF]; - int len; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); - - len = snprintf(name, sizeof(name), "%d", fd); - if (proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, - p, &fd) < 0) { - rcu_read_lock(); - break; - } - rcu_read_lock(); - } - rcu_read_unlock(); - put_files_struct(files); + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(file_inode(file)); + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ret = 0; + if (!dir_emit_dots(file, ctx)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > ctx->pos) + nr_files++; } -out: - put_task_struct(p); -out_no_task: - return retval; -} -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); -} + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_put_task; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; + + info.mode = vma->vm_file->f_mode; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); -} + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + if (!proc_fill_cache(file, ctx, + p->name, p->len, + proc_map_files_instantiate, + task, + (void *)(unsigned long)p->mode)) + break; + ctx->pos++; + } + if (fa) + flex_array_free(fa); + mmput(mm); -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - char tmp[PROC_FDINFO_MAX]; - int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); - if (!err) - err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); - return err; +out_put_task: + put_task_struct(task); +out: + return ret; } -static const struct file_operations proc_fdinfo_file_operations = { - .open = nonseekable_open, - .read = proc_fdinfo_read, +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .iterate = proc_map_files_readdir, + .llseek = default_llseek, }; -static const struct file_operations proc_fd_operations = { - .read = generic_read_dir, - .readdir = proc_readfd, +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; }; -/* - * /proc/pid/fd needs a special permission handler so that a process can still - * access /proc/self/fd after it has executed a setuid(). - */ -static int proc_fd_permission(struct inode *inode, int mask) +static void *timers_start(struct seq_file *m, loff_t *pos) { - int rv; + struct timers_private *tp = m->private; - rv = generic_permission(inode, mask, NULL); - if (rv == 0) - return 0; - if (task_pid(current) == proc_pid(inode)) - rv = 0; - return rv; + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); } -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fd_inode_operations = { - .lookup = proc_lookupfd, - .permission = proc_fd_permission, - .setattr = proc_setattr, -}; +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} -static struct dentry *proc_fdinfo_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) +static void timers_stop(struct seq_file *m, void *v) { - unsigned fd = *(unsigned *)ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); + struct timers_private *tp = m->private; - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; - inode->i_fop = &proc_fdinfo_file_operations; - dentry->d_op = &tid_fd_dentry_operations; - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) - error = NULL; + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } - out: - return error; + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } } -static struct dentry *proc_lookupfdinfo(struct inode *dir, - struct dentry *dentry, - struct nameidata *nd) +static int show_timer(struct seq_file *m, void *v) { - return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); -} + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static char *nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, - proc_fdinfo_instantiate); + timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); + + return 0; } -static const struct file_operations proc_fdinfo_operations = { - .read = generic_read_dir, - .readdir = proc_readfdinfo, +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, }; -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fdinfo_inode_operations = { - .lookup = proc_lookupfdinfo, - .setattr = proc_setattr, -}; +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ -static struct dentry *proc_pident_instantiate(struct inode *dir, +static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) @@ -2045,19 +2120,19 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; /* Use getattr to fix if necessary */ + set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) inode->i_op = p->iop; if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; + if (pid_revalidate(dentry, 0)) + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2065,11 +2140,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - struct dentry *error; + int error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; - error = ERR_PTR(-ENOENT); + error = -ENOENT; if (!task) goto out_no_task; @@ -2092,77 +2167,40 @@ static struct dentry *proc_pident_lookup(struct inode *dir, out: put_task_struct(task); out_no_task: - return error; -} - -static int proc_pident_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_pident_instantiate, task, p); + return ERR_PTR(error); } -static int proc_pident_readdir(struct file *filp, - void *dirent, filldir_t filldir, +static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - const struct pid_entry *p, *last; - ino_t ino; - int ret; + struct task_struct *task = get_proc_task(file_inode(file)); + const struct pid_entry *p; - ret = -ENOENT; if (!task) - goto out_no_task; + return -ENOENT; - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - last = &ents[nents - 1]; - while (p <= last) { - if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) - goto out; - filp->f_pos++; - p++; - } - } + if (!dir_emit_dots(file, ctx)) + goto out; - ret = 1; + if (ctx->pos >= nents + 2) + goto out; + + for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + if (!proc_fill_cache(file, ctx, p->name, p->len, + proc_pident_instantiate, task, p)) + break; + ctx->pos++; + } out: put_task_struct(task); -out_no_task: - return ret; + return 0; } #ifdef CONFIG_SECURITY static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *p = NULL; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2183,7 +2221,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; + struct inode * inode = file_inode(file); char *page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2209,14 +2247,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out_free; /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&task->cred_guard_mutex); + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (length < 0) goto out_free; length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out_free: free_page((unsigned long) page); out: @@ -2228,6 +2266,7 @@ out_no_task: static const struct file_operations proc_pid_attr_operations = { .read = proc_pid_attr_read, .write = proc_pid_attr_write, + .llseek = generic_file_llseek, }; static const struct pid_entry attr_dir_stuff[] = { @@ -2239,20 +2278,20 @@ static const struct pid_entry attr_dir_stuff[] = { REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), }; -static int proc_attr_dir_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); + return proc_pident_readdir(file, ctx, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, - .readdir = proc_attr_dir_readdir, + .iterate = proc_attr_dir_readdir, + .llseek = default_llseek, }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); @@ -2270,7 +2309,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = { static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; char buffer[PROC_NUMBUF]; size_t len; @@ -2322,7 +2361,7 @@ static ssize_t proc_coredump_filter_write(struct file *file, goto out_no_task; ret = -ESRCH; - task = get_proc_task(file->f_dentry->d_inode); + task = get_proc_task(file_inode(file)); if (!task) goto out_no_task; @@ -2348,162 +2387,25 @@ static ssize_t proc_coredump_filter_write(struct file *file, static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return ERR_PTR(-ENOENT); - sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); - return ERR_PTR(vfs_follow_link(nd,tmp)); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe something that is process related. - */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - if (task) { - put_task_struct(task); - return 1; - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations proc_base_dentry_operations = -{ - .d_revalidate = proc_base_revalidate, - .d_delete = pid_delete_dentry, -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-EINVAL); - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - inode->i_nlink = 2; - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - dentry->d_op = &proc_base_dentry_operations; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; + int result; + + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2514,7 +2416,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - return sprintf(buffer, + result = sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2529,6 +2431,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) @@ -2542,11 +2447,96 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static int proc_projid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_projid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_projid_map_operations = { + .open = proc_projid_map_open, + .write = proc_projid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; +#endif /* CONFIG_USER_NS */ + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - seq_printf(m, "%08x\n", task->personality); - return 0; + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; } /* @@ -2558,7 +2548,11 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -2566,10 +2560,13 @@ static const struct pid_entry tgid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), @@ -2577,9 +2574,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_maps_operations), + REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -2590,7 +2587,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_smaps_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -2615,7 +2612,8 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -2627,23 +2625,35 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), + INF("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), +#endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), #endif }; -static int proc_tgid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); + return proc_pident_readdir(file, ctx, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .readdir = proc_tgid_base_readdir, + .iterate = proc_tgid_base_readdir, + .llseek = default_llseek, }; -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } @@ -2652,6 +2662,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -2662,6 +2673,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", pid); + /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { shrink_dcache_parent(dentry); @@ -2736,17 +2748,12 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } -static struct dentry *proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, - struct task_struct *task, const void *ptr) +static int proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -2758,30 +2765,26 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; + if (pid_revalidate(dentry, 0)) + return 0; out: - return error; + return -ENOENT; } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = ERR_PTR(-ENOENT); + int result = 0; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2798,7 +2801,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct result = proc_pid_instantiate(dir, dentry, task, NULL); put_task_struct(task); out: - return result; + return ERR_PTR(result); } /* @@ -2844,50 +2847,44 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) - -static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct tgid_iter iter) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", iter.tgid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_pid_instantiate, iter.task, NULL); -} +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) /* for the /proc/ directory itself, after non-process stuff has been done */ -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +int proc_pid_readdir(struct file *file, struct dir_context *ctx) { - unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); struct tgid_iter iter; - struct pid_namespace *ns; + struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info; + loff_t pos = ctx->pos; - if (!reaper) - goto out_no_task; + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) + return 0; - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; + if (pos == TGID_OFFSET - 1) { + struct inode *inode = ns->proc_self->d_inode; + if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + return 0; + iter.tgid = 0; + } else { + iter.tgid = pos - TGID_OFFSET; } - - ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; - iter.tgid = filp->f_pos - TGID_OFFSET; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + char name[PROC_NUMBUF]; + int len; + if (!has_pid_permissions(ns, iter.task, 2)) + continue; + + len = snprintf(name, sizeof(name), "%d", iter.tgid); + ctx->pos = iter.tgid + TGID_OFFSET; + if (!proc_fill_cache(file, ctx, name, len, + proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); - goto out; + return 0; } } - filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; -out: - put_task_struct(reaper); -out_no_task: + ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } @@ -2896,12 +2893,13 @@ out_no_task: */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -2912,9 +2910,12 @@ static const struct pid_entry tid_base_stuff[] = { INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_maps_operations), + REG("maps", S_IRUGO, proc_tid_maps_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("children", S_IRUGO, proc_tid_children_operations), +#endif #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -2924,7 +2925,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_smaps_operations), + REG("smaps", S_IRUGO, proc_tid_smaps_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -2949,34 +2950,44 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), - REG("sessionid", S_IRUSR, proc_sessionid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tid_io_accounting), + INF("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif }; -static int proc_tid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); + return proc_pident_readdir(file, ctx, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, - .readdir = proc_tid_base_readdir, + .iterate = proc_tid_base_readdir, + .llseek = default_llseek, }; static const struct inode_operations proc_tid_base_inode_operations = { @@ -2985,10 +2996,9 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static struct dentry *proc_task_instantiate(struct inode *dir, +static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -2999,22 +3009,22 @@ static struct dentry *proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff)); + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; + if (pid_revalidate(dentry, 0)) + return 0; out: - return error; + return -ENOENT; } -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = ERR_PTR(-ENOENT); + int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; @@ -3044,7 +3054,7 @@ out_drop_task: out: put_task_struct(leader); out_no_task: - return result; + return ERR_PTR(result); } /* @@ -3059,34 +3069,42 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) { - struct task_struct *pos; + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; rcu_read_lock(); - /* Attempt to start with the pid of a thread */ - if (tid && (nr > 0)) { + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); - if (pos && (pos->group_leader == leader)) + if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - pos = NULL; - if (nr && nr >= get_nr_threads(leader)) - goto out; + if (nr >= get_nr_threads(task)) + goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - for (pos = leader; nr > 0; --nr) { - pos = next_thread(pos); - if (pos == leader) { - pos = NULL; - goto out; - } - } + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; found: get_task_struct(pos); out: @@ -3116,78 +3134,44 @@ static struct task_struct *next_tid(struct task_struct *start) return pos; } -static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct task_struct *task, int tid) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", tid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_task_instantiate, task, NULL); -} - /* for the /proc/TGID/task/ directories */ -static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *leader = NULL; + struct inode *inode = file_inode(file); struct task_struct *task; - int retval = -ENOENT; - ino_t ino; - int tid; struct pid_namespace *ns; + int tid; - task = get_proc_task(inode); - if (!task) - goto out_no_task; - rcu_read_lock(); - if (pid_alive(task)) { - leader = task->group_leader; - get_task_struct(leader); - } - rcu_read_unlock(); - put_task_struct(task); - if (!leader) - goto out_no_task; - retval = 0; + if (proc_inode_is_dead(inode)) + return -ENOENT; - switch ((unsigned long)filp->f_pos) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - } + if (!dir_emit_dots(file, ctx)) + return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = filp->f_dentry->d_sb->s_fs_info; - tid = (int)filp->f_version; - filp->f_version = 0; - for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + ns = file->f_dentry->d_sb->s_fs_info; + tid = (int)file->f_version; + file->f_version = 0; + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; - task = next_tid(task), filp->f_pos++) { + task = next_tid(task), ctx->pos++) { + char name[PROC_NUMBUF]; + int len; tid = task_pid_nr_ns(task, ns); - if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + len = snprintf(name, sizeof(name), "%d", tid); + if (!proc_fill_cache(file, ctx, name, len, + proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - filp->f_version = (u64)tid; + file->f_version = (u64)tid; put_task_struct(task); break; } } -out: - put_task_struct(leader); -out_no_task: - return retval; + + return 0; } static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -3208,9 +3192,11 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { .read = generic_read_dir, - .readdir = proc_task_readdir, + .iterate = proc_task_readdir, + .llseek = default_llseek, }; diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3fcd1..cbd82dff7e8 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void) proc_create("cmdline", 0, NULL, &cmdline_proc_fops); return 0; } -module_init(proc_cmdline_init); +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 00000000000..290ba85cb90 --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/tty_driver.h> + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_setwidth(m, 21 - 1); + seq_printf(m, "%s%d", con->name, con->index); + seq_pad(m, ' '); + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 5a1e539a234..06f4d31e039 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void) proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); return 0; } -module_init(proc_cpuinfo_init); +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 59ee7da959c..50493edc30e 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v) if (i < CHRDEV_MAJOR_HASH_SIZE) { if (i == 0) - seq_printf(f, "Character devices:\n"); + seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { i -= CHRDEV_MAJOR_HASH_SIZE; if (i == 0) - seq_printf(f, "\nBlock devices:\n"); + seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); } #endif @@ -67,4 +67,4 @@ static int __init proc_devices_init(void) proc_create("devices", 0, NULL, &proc_devinfo_operations); return 0; } -module_init(proc_devices_init); +fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c new file mode 100644 index 00000000000..0788d093f5d --- /dev/null +++ b/fs/proc/fd.c @@ -0,0 +1,351 @@ +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/dcache.h> +#include <linux/path.h> +#include <linux/fdtable.h> +#include <linux/namei.h> +#include <linux/pid.h> +#include <linux/security.h> +#include <linux/file.h> +#include <linux/seq_file.h> + +#include <linux/proc_fs.h> + +#include "../mount.h" +#include "internal.h" +#include "fd.h" + +static int seq_show(struct seq_file *m, void *v) +{ + struct files_struct *files = NULL; + int f_flags = 0, ret = -ENOENT; + struct file *file = NULL; + struct task_struct *task; + + task = get_proc_task(m->private); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + + if (files) { + int fd = proc_fd(m->private); + + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + struct fdtable *fdt = files_fdtable(files); + + f_flags = file->f_flags; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + get_file(file); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + if (!ret) { + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); + if (file->f_op->show_fdinfo) + ret = file->f_op->show_fdinfo(m, file); + fput(file); + } + + return ret; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = seq_fdinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct files_struct *files; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int fd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + struct file *file; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + unsigned f_mode = file->f_mode; + + rcu_read_unlock(); + put_files_struct(files); + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + struct files_struct *files = NULL; + struct task_struct *task; + int ret = -ENOENT; + + task = get_proc_task(dentry->d_inode); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + + if (files) { + int fd = proc_fd(dentry->d_inode); + struct file *fd_file; + + spin_lock(&files->file_lock); + fd_file = fcheck_files(files, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + return ret; +} + +static int +proc_fd_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFLNK; + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + + ei->op.proc_get_link = proc_fd_link; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + return 0; + out: + return -ENOENT; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + int result = -ENOENT; + unsigned fd = name_to_int(dentry); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); +out: + put_task_struct(task); +out_no_task: + return ERR_PTR(result); +} + +static int proc_readfd_common(struct file *file, struct dir_context *ctx, + instantiate_t instantiate) +{ + struct task_struct *p = get_proc_task(file_inode(file)); + struct files_struct *files; + unsigned int fd; + + if (!p) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + files = get_files_struct(p); + if (!files) + goto out; + + rcu_read_lock(); + for (fd = ctx->pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, ctx->pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (!proc_fill_cache(file, ctx, + name, len, instantiate, p, + (void *)(unsigned long)fd)) + goto out_fd_loop; + rcu_read_lock(); + } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); +out: + put_task_struct(p); + return 0; +} + +static int proc_readfd(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .iterate = proc_readfd, + .llseek = default_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +int proc_fd_permission(struct inode *inode, int mask) +{ + int rv = generic_permission(inode, mask); + if (rv == 0) + return 0; + if (task_tgid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + return 0; + out: + return -ENOENT; +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, + proc_fdinfo_instantiate); +} + +const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .iterate = proc_readfdinfo, + .llseek = default_llseek, +}; diff --git a/fs/proc/fd.h b/fs/proc/fd.h new file mode 100644 index 00000000000..7c047f256ae --- /dev/null +++ b/fs/proc/fd.h @@ -0,0 +1,19 @@ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include <linux/fs.h> + +extern const struct file_operations proc_fd_operations; +extern const struct inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct inode *inode, int mask); + +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +#endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 480cb1065ee..b7f268eb5f4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -12,7 +12,10 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> +#include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> +#include <linux/printk.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> @@ -26,229 +29,13 @@ DEFINE_SPINLOCK(proc_subdir_lock); -static int proc_match(int len, const char *name, struct proc_dir_entry *de) +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { if (de->namelen != len) return 0; return !memcmp(name, de->name, len); } -/* buffer size is one page but our output routines use some slack for overruns */ -#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) - -static ssize_t -__proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct inode * inode = file->f_path.dentry->d_inode; - char *page; - ssize_t retval=0; - int eof=0; - ssize_t n, count; - char *start; - struct proc_dir_entry * dp; - unsigned long long pos; - - /* - * Gaah, please just use "seq_file" instead. The legacy /proc - * interfaces cut loff_t down to off_t for reads, and ignore - * the offset entirely for writes.. - */ - pos = *ppos; - if (pos > MAX_NON_LFS) - return 0; - if (nbytes > MAX_NON_LFS - pos) - nbytes = MAX_NON_LFS - pos; - - dp = PDE(inode); - if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - while ((nbytes > 0) && !eof) { - count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); - - start = NULL; - if (dp->read_proc) { - /* - * How to be a proc read function - * ------------------------------ - * Prototype: - * int f(char *buffer, char **start, off_t offset, - * int count, int *peof, void *dat) - * - * Assume that the buffer is "count" bytes in size. - * - * If you know you have supplied all the data you - * have, set *peof. - * - * You have three ways to return data: - * 0) Leave *start = NULL. (This is the default.) - * Put the data of the requested offset at that - * offset within the buffer. Return the number (n) - * of bytes there are from the beginning of the - * buffer up to the last byte of data. If the - * number of supplied bytes (= n - offset) is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by the number of bytes - * absorbed. This interface is useful for files - * no larger than the buffer. - * 1) Set *start = an unsigned long value less than - * the buffer address but greater than zero. - * Put the data of the requested offset at the - * beginning of the buffer. Return the number of - * bytes of data placed there. If this number is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by *start. This interface is - * useful when you have a large file consisting - * of a series of blocks which you want to count - * and return as wholes. - * (Hack by Paul.Russell@rustcorp.com.au) - * 2) Set *start = an address within the buffer. - * Put the data of the requested offset at *start. - * Return the number of bytes of data placed there. - * If this number is greater than zero and you - * didn't signal eof and the reader is prepared to - * take more data you will be called again with the - * requested offset advanced by the number of bytes - * absorbed. - */ - n = dp->read_proc(page, &start, *ppos, - count, &eof, dp->data); - } else - break; - - if (n == 0) /* end of file */ - break; - if (n < 0) { /* error */ - if (retval == 0) - retval = n; - break; - } - - if (start == NULL) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); - n = PAGE_SIZE; - } - n -= *ppos; - if (n <= 0) - break; - if (n > count) - n = count; - start = page + *ppos; - } else if (start < page) { - if (n > PAGE_SIZE) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); - n = PAGE_SIZE; - } - if (n > count) { - /* - * Don't reduce n because doing so might - * cut off part of a data block. - */ - printk(KERN_WARNING - "proc_file_read: Read count exceeded\n"); - } - } else /* start >= page */ { - unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) { - printk(KERN_ERR - "proc_file_read: Apparent buffer overflow!\n"); - n = PAGE_SIZE - startoff; - } - if (n > count) - n = count; - } - - n -= copy_to_user(buf, start < page ? page : start, n); - if (n == 0) { - if (retval == 0) - retval = -EFAULT; - break; - } - - *ppos += start < page ? (unsigned long)start : n; - nbytes -= n; - buf += n; - retval += n; - } - free_page((unsigned long) page); - return retval; -} - -static ssize_t -proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - ssize_t rv = -EIO; - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - rv = __proc_file_read(file, buf, nbytes, ppos); - - pde_users_dec(pde); - return rv; -} - -static ssize_t -proc_file_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - ssize_t rv = -EIO; - - if (pde->write_proc) { - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - /* FIXME: does this routine need ppos? probably... */ - rv = pde->write_proc(file, buffer, count, pde->data); - pde_users_dec(pde); - } - return rv; -} - - -static loff_t -proc_file_lseek(struct file *file, loff_t offset, int orig) -{ - loff_t retval = -EINVAL; - switch (orig) { - case 1: - offset += file->f_pos; - /* fallthrough */ - case 0: - if (offset < 0 || offset > MAX_NON_LFS) - break; - file->f_pos = retval = offset; - } - return retval; -} - -static const struct file_operations proc_file_operations = { - .llseek = proc_file_lseek, - .read = proc_file_read, - .write = proc_file_write, -}; - static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -257,17 +44,14 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) error = inode_change_ok(inode, iattr); if (error) - goto out; + return error; - error = inode_setattr(inode, iattr); - if (error) - goto out; - - de->uid = inode->i_uid; - de->gid = inode->i_gid; + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; -out: - return error; + return 0; } static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -276,7 +60,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct proc_dir_entry *de = PROC_I(inode)->pde; if (de && de->nlink) - inode->i_nlink = de->nlink; + set_nlink(inode, de->nlink); generic_fillattr(inode, stat); return 0; @@ -291,19 +75,17 @@ static const struct inode_operations proc_file_inode_operations = { * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. */ -static int xlate_proc_name(const char *name, - struct proc_dir_entry **ret, const char **residual) +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; - int len; - int rtn = 0; + unsigned int len; de = *ret; if (!de) de = &proc_root; - spin_lock(&proc_subdir_lock); while (1) { next = strchr(cp, '/'); if (!next) @@ -315,16 +97,25 @@ static int xlate_proc_name(const char *name, break; } if (!de) { - rtn = -ENOENT; - goto out; + WARN(1, "name '%s'\n", name); + return -ENOENT; } cp += len + 1; } *residual = cp; *ret = de; -out: + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); spin_unlock(&proc_subdir_lock); - return rtn; + return rv; } static DEFINE_IDA(proc_inum_ida); @@ -335,58 +126,45 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ /* * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. - * - * Current inode allocations in the proc-fs (hex-numbers): - * - * 00000000 reserved - * 00000001-00000fff static entries (goners) - * 001 root-ino - * - * 00001000-00001fff unused - * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff - * 80000000-efffffff unused - * f0000000-ffffffff dynamic entries - * - * Goal: - * Once we split the thing into several virtual filesystems, - * we will get rid of magical ranges (and this comment, BTW). */ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; - spin_lock(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock(&proc_inum_lock); - return 0; + spin_unlock_irq(&proc_inum_lock); + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { - spin_lock(&proc_inum_lock); + unsigned long flags; + spin_lock_irqsave(&proc_inum_lock, flags); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock(&proc_inum_lock); + spin_unlock_irqrestore(&proc_inum_lock, flags); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, PDE(dentry->d_inode)->data); + nd_set_link(nd, __PDE_DATA(dentry->d_inode)); return NULL; } @@ -396,61 +174,35 @@ static const struct inode_operations proc_link_inode_operations = { }; /* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - -/* * Don't create negative dentries here, return -ENOENT by hand * instead. */ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *dentry) { - struct inode *inode = NULL; - int error = -ENOENT; + struct inode *inode; spin_lock(&proc_subdir_lock); for (de = de->subdir; de ; de = de->next) { if (de->namelen != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; - - ino = de->low_ino; pde_get(de); spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); - goto out_unlock; + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, inode); + return NULL; } } spin_unlock(&proc_subdir_lock); -out_unlock: - - if (inode) { - dentry->d_op = &proc_dentry_operations; - d_add(dentry, inode); - return NULL; - } - if (de) - pde_put(de); - return ERR_PTR(error); + return ERR_PTR(-ENOENT); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookup_de(PDE(dir), dir, dentry); } @@ -464,76 +216,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir) +int proc_readdir_de(struct proc_dir_entry *de, struct file *file, + struct dir_context *ctx) { - unsigned int ino; int i; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret = 0; - - ino = inode->i_ino; - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, i, - parent_ino(filp->f_path.dentry), - DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - spin_lock(&proc_subdir_lock); - de = de->subdir; - i -= 2; - for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } - if (!i) - break; - de = de->next; - i--; - } - do { - struct proc_dir_entry *next; - - /* filldir passes info to user space */ - pde_get(de); - spin_unlock(&proc_subdir_lock); - if (filldir(dirent, de->name, de->namelen, filp->f_pos, - de->low_ino, de->mode >> 12) < 0) { - pde_put(de); - goto out; - } - spin_lock(&proc_subdir_lock); - filp->f_pos++; - next = de->next; - pde_put(de); - de = next; - } while (de); + if (!dir_emit_dots(file, ctx)) + return 0; + + spin_lock(&proc_subdir_lock); + de = de->subdir; + i = ctx->pos - 2; + for (;;) { + if (!de) { spin_unlock(&proc_subdir_lock); + return 0; + } + if (!i) + break; + de = de->next; + i--; } - ret = 1; -out: - return ret; + + do { + struct proc_dir_entry *next; + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (!dir_emit(ctx, de->name, de->namelen, + de->low_ino, de->mode >> 12)) { + pde_put(de); + return 0; + } + spin_lock(&proc_subdir_lock); + ctx->pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + return 1; } -int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +int proc_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), filp, dirent, filldir); + return proc_readdir_de(PDE(inode), file, ctx); } /* @@ -544,7 +272,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_readdir, + .iterate = proc_readdir, }; /* @@ -558,35 +286,32 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { - if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - } + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; dir->nlink++; } else if (S_ISLNK(dp->mode)) { - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_link_inode_operations; + dp->proc_iops = &proc_link_inode_operations; } else if (S_ISREG(dp->mode)) { - if (dp->proc_fops == NULL) - dp->proc_fops = &proc_file_operations; - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_file_inode_operations; + BUG_ON(dp->proc_fops == NULL); + dp->proc_iops = &proc_file_inode_operations; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&proc_subdir_lock); for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); break; } @@ -601,15 +326,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, - mode_t mode, + umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; const char *fn = name; - int len; + unsigned int len; /* make sure name is valid */ - if (!name || !strlen(name)) goto out; + if (!name || !strlen(name)) + goto out; if (xlate_proc_name(name, parent, &fn) != 0) goto out; @@ -620,21 +346,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, len = strlen(fn); - ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); - if (!ent) goto out; + ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) + goto out; - memset(ent, 0, sizeof(struct proc_dir_entry)); - memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); - ent->name = ((char *) ent) + sizeof(*ent); + memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); - ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); - ent->pde_unload_completion = NULL; INIT_LIST_HEAD(&ent->pde_openers); - out: +out: return ent; } @@ -662,14 +385,19 @@ struct proc_dir_entry *proc_symlink(const char *name, } return ent; } +EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) { struct proc_dir_entry *ent; + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { + ent->data = data; if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; @@ -677,79 +405,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, } return ent; } +EXPORT_SYMBOL_GPL(proc_mkdir_data); -struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) { - struct proc_dir_entry *ent; - - ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); - if (ent) { - ent->data = net; - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; + return proc_mkdir_data(name, mode, parent, NULL); } -EXPORT_SYMBOL_GPL(proc_net_mkdir); +EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { - return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); + return proc_mkdir_data(name, 0, parent, NULL); } +EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *ent; - nlink_t nlink; - - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; - } - - ent = __proc_create(&parent, name, mode, nlink); - if (ent) { - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; -} - -struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) { struct proc_dir_entry *pde; - nlink_t nlink; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; + if (!S_ISREG(mode)) { + WARN_ON(1); /* use proc_mkdir() */ + return NULL; } - pde = __proc_create(&parent, name, mode, nlink); + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + pde = __proc_create(&parent, name, mode, 1); if (!pde) goto out; pde->proc_fops = proc_fops; @@ -762,15 +450,24 @@ out_free: out: return NULL; } - -static void free_proc_entry(struct proc_dir_entry *de) +EXPORT_SYMBOL(proc_create_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) { - unsigned int ino = de->low_ino; + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); - if (ino < PROC_DYNAMIC_FIRST) - return; +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); - release_inode_number(ino); +static void free_proc_entry(struct proc_dir_entry *de) +{ + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); @@ -791,13 +488,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) struct proc_dir_entry **p; struct proc_dir_entry *de = NULL; const char *fn = name; - int len; + unsigned int len; - if (xlate_proc_name(name, &parent, &fn) != 0) + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); return; + } len = strlen(fn); - spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (proc_match(len, fn, *p)) { de = *p; @@ -807,49 +506,93 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } } spin_unlock(&proc_subdir_lock); - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return; + } - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. */ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); + proc_entry_rundown(de); - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + WARN(de->subdir, "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); - spin_unlock(&de->pde_unload_lock); +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; - wait_for_completion(de->pde_unload_completion); + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); - goto continue_removing; + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + root = *p; + *p = root->next; + root->next = NULL; + break; + } } - spin_unlock(&de->pde_unload_lock); - -continue_removing: - spin_lock(&de->pde_unload_lock); - while (!list_empty(&de->pde_openers)) { - struct pde_opener *pdeo; - - pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); - list_del(&pdeo->lh); - spin_unlock(&de->pde_unload_lock); - pdeo->release(pdeo->inode, pdeo->file); - kfree(pdeo); - spin_lock(&de->pde_unload_lock); + if (!root) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; } - spin_unlock(&de->pde_unload_lock); + de = root; + while (1) { + next = de->subdir; + if (next) { + de->subdir = next->next; + next->next = NULL; + de = next; + continue; + } + spin_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + de->nlink = 0; + if (de == root) + break; + pde_put(de); - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; - WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); - pde_put(de); + spin_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} +EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +void *PDE_DATA(const struct inode *inode) +{ + return __PDE_DATA(inode); } +EXPORT_SYMBOL(PDE_DATA); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 445a02bcaab..0adbc02d60e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,28 +7,36 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/completion.h> #include <linux/poll.h> +#include <linux/printk.h> #include <linux/file.h> #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sysctl.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/magic.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "internal.h" -static void proc_delete_inode(struct inode *inode) +static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; + struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; + void *ns; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); /* Stop tracking associated processes */ put_pid(PROC_I(inode)->pid); @@ -37,13 +45,18 @@ static void proc_delete_inode(struct inode *inode) de = PROC_I(inode)->pde; if (de) pde_put(de); - if (PROC_I(inode)->sysctl) - sysctl_head_put(PROC_I(inode)->sysctl); - clear_inode(inode); + head = PROC_I(inode)->sysctl; + if (head) { + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns.ns_ops; + ns = PROC_I(inode)->ns.ns; + if (ns_ops && ns) + ns_ops->put(ns); } -struct vfsmount *proc_mnt; - static struct kmem_cache * proc_inode_cachep; static struct inode *proc_alloc_inode(struct super_block *sb) @@ -60,16 +73,24 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + ei->ns.ns = NULL; + ei->ns.ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; @@ -86,203 +107,207 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, - .delete_inode = proc_delete_inode, + .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; -static void __pde_users_dec(struct proc_dir_entry *pde) +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) { - pde->pde_users--; - if (pde->pde_unload_completion && pde->pde_users == 0) - complete(pde->pde_unload_completion); + return atomic_inc_unless_negative(&pde->in_use); } -void pde_users_dec(struct proc_dir_entry *pde) +static void unuse_pde(struct proc_dir_entry *pde) { - spin_lock(&pde->pde_unload_lock); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + if (atomic_dec_return(&pde->in_use) == BIAS) + complete(pde->pde_unload_completion); } -static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +/* pde is locked */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - loff_t rv = -EINVAL; - loff_t (*llseek)(struct file *, loff_t, int); - - spin_lock(&pde->pde_unload_lock); - /* - * remove_proc_entry() is going to delete PDE (as part of module - * cleanup sequence). No new callers into module allowed. - */ - if (!pde->proc_fops) { + if (pdeo->closing) { + /* somebody else is doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); - return rv; + wait_for_completion(&c); + spin_lock(&pde->pde_unload_lock); + } else { + struct file *file; + pdeo->closing = 1; + spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; + pde->proc_fops->release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); + list_del_init(&pdeo->lh); + if (pdeo->c) + complete(pdeo->c); + kfree(pdeo); } - /* - * Bump refcount so that remove_proc_entry will wail for ->llseek to - * complete. - */ - pde->pde_users++; - /* - * Save function pointer under lock, to protect against ->proc_fops - * NULL'ifying right after ->pde_unload_lock is dropped. - */ - llseek = pde->proc_fops->llseek; - spin_unlock(&pde->pde_unload_lock); +} - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + } + spin_unlock(&de->pde_unload_lock); +} - pde_users_dec(pde); +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + loff_t rv = -EINVAL; + if (use_pde(pde)) { + loff_t (*llseek)(struct file *, loff_t, int); + llseek = pde->proc_fops->llseek; + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + unuse_pde(pde); + } return rv; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - ssize_t rv = -EIO; ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + if (use_pde(pde)) { + read = pde->proc_fops->read; + if (read) + rv = read(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - read = pde->proc_fops->read; - spin_unlock(&pde->pde_unload_lock); - - if (read) - rv = read(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - ssize_t rv = -EIO; ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + if (use_pde(pde)) { + write = pde->proc_fops->write; + if (write) + rv = write(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - write = pde->proc_fops->write; - spin_unlock(&pde->pde_unload_lock); - - if (write) - rv = write(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + poll = pde->proc_fops->poll; + if (poll) + rv = poll(file, pts); + unuse_pde(pde); } - pde->pde_users++; - poll = pde->proc_fops->poll; - spin_unlock(&pde->pde_unload_lock); - - if (poll) - rv = poll(file, pts); - - pde_users_dec(pde); return rv; } static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long); - int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - unlocked_ioctl = pde->proc_fops->unlocked_ioctl; - ioctl = pde->proc_fops->ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (unlocked_ioctl) { - rv = unlocked_ioctl(file, cmd, arg); - if (rv == -ENOIOCTLCMD) - rv = -EINVAL; - } else if (ioctl) { - lock_kernel(); - rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); - unlock_kernel(); + long (*ioctl)(struct file *, unsigned int, unsigned long); + if (use_pde(pde)) { + ioctl = pde->proc_fops->unlocked_ioctl; + if (ioctl) + rv = ioctl(file, cmd, arg); + unuse_pde(pde); } - - pde_users_dec(pde); return rv; } #ifdef CONFIG_COMPAT static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + compat_ioctl = pde->proc_fops->compat_ioctl; + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - compat_ioctl = pde->proc_fops->compat_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } #endif static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { - struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + mmap = pde->proc_fops->mmap; + if (mmap) + rv = mmap(file, vma); + unuse_pde(pde); } - pde->pde_users++; - mmap = pde->proc_fops->mmap; - spin_unlock(&pde->pde_unload_lock); + return rv; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; - if (mmap) - rv = mmap(file, vma); + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; - pde_users_dec(pde); + get_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + if (!get_area) + get_area = current->mm->get_unmapped_area; +#endif + + if (get_area) + rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; + unuse_pde(pde); + } return rv; } @@ -304,91 +329,47 @@ static int proc_reg_open(struct inode *inode, struct file *file) * by hand in remove_proc_entry(). For this, save opener's credentials * for later. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + if (!use_pde(pde)) { kfree(pdeo); - return -EINVAL; + return -ENOENT; } - pde->pde_users++; open = pde->proc_fops->open; release = pde->proc_fops->release; - spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - spin_lock(&pde->pde_unload_lock); if (rv == 0 && release) { /* To know what to release. */ - pdeo->inode = inode; pdeo->file = file; /* Strictly for "too late" ->release in proc_reg_release(). */ - pdeo->release = release; + spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); } else kfree(pdeo); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); - return rv; -} -static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, - struct inode *inode, struct file *file) -{ - struct pde_opener *pdeo; - - list_for_each_entry(pdeo, &pde->pde_openers, lh) { - if (pdeo->inode == inode && pdeo->file == file) - return pdeo; - } - return NULL; + unuse_pde(pde); + return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - int rv = 0; - int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; - spin_lock(&pde->pde_unload_lock); - pdeo = find_pde_opener(pde, inode, file); - if (!pde->proc_fops) { - /* - * Can't simply exit, __fput() will think that everything is OK, - * and move on to freeing struct file. remove_proc_entry() will - * find slacker in opener's list and will try to do non-trivial - * things with struct file. Therefore, remove opener from list. - * - * But if opener is removed from list, who will ->release it? - */ - if (pdeo) { - list_del(&pdeo->lh); - spin_unlock(&pde->pde_unload_lock); - rv = pdeo->release(inode, file); - kfree(pdeo); - } else - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - release = pde->proc_fops->release; - if (pdeo) { - list_del(&pdeo->lh); - kfree(pdeo); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + break; + } } spin_unlock(&pde->pde_unload_lock); - - if (release) - rv = release(inode, file); - - pde_users_dec(pde); - return rv; + return 0; } static const struct file_operations proc_reg_file_ops = { @@ -401,6 +382,7 @@ static const struct file_operations proc_reg_file_ops = { .compat_ioctl = proc_reg_compat_ioctl, #endif .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; @@ -413,22 +395,19 @@ static const struct file_operations proc_reg_file_ops_no_compat = { .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; #endif -struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, - struct proc_dir_entry *de) +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode * inode; + struct inode *inode = new_inode_pseudo(sb); - inode = iget_locked(sb, ino); - if (!inode) - return NULL; - if (inode->i_state & I_NEW) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; if (de->mode) { @@ -439,9 +418,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, if (de->size) inode->i_size = de->size; if (de->nlink) - inode->i_nlink = de->nlink; - if (de->proc_iops) - inode->i_op = de->proc_iops; + set_nlink(inode, de->nlink); + WARN_ON(!de->proc_iops); + inode->i_op = de->proc_iops; if (de->proc_fops) { if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT @@ -455,15 +434,14 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; -} +} int proc_fill_super(struct super_block *s) { - struct inode * root_inode; + struct inode *root_inode; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; @@ -473,19 +451,17 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); - if (!root_inode) - goto out_no_root; - root_inode->i_uid = 0; - root_inode->i_gid = 0; - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) - goto out_no_root; - return 0; + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } -out_no_root: - printk("proc_read_super: get root inode failed\n"); - iput(root_inode); - pde_put(&proc_root); - return -ENOMEM; + return proc_setup_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1f24a3eddd1..3ab6d14e71c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -1,4 +1,4 @@ -/* internal.h: internal procfs definitions +/* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -10,58 +10,82 @@ */ #include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/binfmts.h> -extern struct proc_dir_entry proc_root; -#ifdef CONFIG_PROC_SYSCTL -extern int proc_sys_init(void); -#else -static inline void proc_sys_init(void) { } -#endif -#ifdef CONFIG_NET -extern int proc_net_init(void); -#else -static inline int proc_net_init(void) { return 0; } -#endif +struct ctl_table_header; +struct mempolicy; -struct vmalloc_info { - unsigned long used; - unsigned long largest_chunk; +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * The "next" pointer creates a linked list of one /proc directory, + * while parent/subdir create the directory structure (every + * /proc file has a parent, but "subdir" is NULL for all + * non-directory entries). + */ +struct proc_dir_entry { + unsigned int low_ino; + umode_t mode; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + struct proc_dir_entry *next, *parent, *subdir; + void *data; + atomic_t count; /* use count */ + atomic_t in_use; /* number of callers into module in progress; */ + /* negative -> it's going away RSN */ + struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + u8 namelen; + char name[]; }; -extern struct mm_struct *mm_for_maps(struct task_struct *); +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); +}; -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -extern void get_vmalloc_info(struct vmalloc_info *vmi); -#else +struct proc_inode { + struct pid *pid; + int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct proc_ns ns; + struct inode vfs_inode; +}; -#define VMALLOC_TOTAL 0UL -#define get_vmalloc_info(vmi) \ -do { \ - (vmi)->used = 0; \ - (vmi)->largest_chunk = 0; \ -} while(0) -#endif +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} -extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); - -extern const struct file_operations proc_maps_operations; -extern const struct file_operations proc_numa_maps_operations; -extern const struct file_operations proc_smaps_operations; -extern const struct file_operations proc_clear_refs_operations; -extern const struct file_operations proc_pagemap_operations; -extern const struct file_operations proc_net_operations; -extern const struct inode_operations proc_net_inode_operations; +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} -void proc_init_inodecache(void); +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} static inline struct pid *proc_pid(struct inode *inode) { @@ -73,49 +97,195 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_fd(struct inode *inode) +static inline int task_dumpable(struct task_struct *task) { - return PROC_I(inode)->fd; + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if (dumpable == SUID_DUMP_USER) + return 1; + return 0; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, - struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +static inline unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; -struct pde_opener { - struct inode *inode; - struct file *file; - int (*release)(struct inode *, struct file *); - struct list_head lh; -}; -void pde_users_dec(struct proc_dir_entry *pde); + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 + +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern int pid_revalidate(struct dentry *, unsigned int); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef int instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -int task_statm(struct mm_struct *, int *, int *, int *, int *); -void task_mem(struct seq_file *, struct mm_struct *); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, + struct dentry *); +extern int proc_readdir(struct file *, struct dir_context *); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { atomic_inc(&pde->count); return pde; } -void pde_put(struct proc_dir_entry *pde); +extern void pde_put(struct proc_dir_entry *); -extern struct vfsmount *proc_mnt; -int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); +/* + * inode.c + */ +struct pde_opener { + struct file *file; + struct list_head lh; + int closing; + struct completion *c; +}; + +extern const struct inode_operations proc_pid_link_inode_operations; + +extern void proc_init_inodecache(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern int proc_fill_super(struct super_block *); +extern void proc_entry_rundown(struct proc_dir_entry *); /* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/<pid> subdirectories. + * proc_namespaces.c + */ +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_sysctl.c */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *); +#else +static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); +extern int proc_remount(struct super_block *, int *, char *); + +/* + * task_[no]mmu.c + */ +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +}; + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 05029c0e2f2..a352d5703b4 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void) proc_create("interrupts", 0, NULL, &proc_interrupts_operations); return 0; } -module_init(proc_interrupts_init); +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index a44a7897fd4..39e6ef32f0b 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -11,20 +11,25 @@ #include <linux/mm.h> #include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> #include <linux/memory.h> #include <asm/sections.h> +#include "internal.h" #define CORE_STR "CORE" @@ -156,7 +161,8 @@ static int kcore_update_ram(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* calculate vmemmap's address from given system ram pfn and register it */ -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; unsigned long nr_pages = ent->size >> PAGE_SHIFT; @@ -188,7 +194,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) } #else -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { return 1; } @@ -246,10 +253,9 @@ static int kcore_update_ram(void) /* Not inialized....update now */ /* find out "max pfn" */ end_pfn = 0; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long node_end; - node_end = NODE_DATA(nid)->node_start_pfn + - NODE_DATA(nid)->node_spanned_pages; + node_end = node_end_pfn(nid); if (end_pfn < node_end) end_pfn = node_end; } @@ -401,7 +407,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) prpsinfo.pr_zomb = 0; strcpy(prpsinfo.pr_fname, "vmlinux"); - strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); + strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); nhdr->p_filesz += notesize(¬es[1]); bufp = storenote(¬es[1], bufp); @@ -490,7 +496,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } read_unlock(&kclist_lock); - if (m == NULL) { + if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) return -EFAULT; } else if (is_vmalloc_or_module_addr((void *)start)) { @@ -512,7 +518,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) n = copy_to_user(buffer, (char *)start, tsz); /* - * We cannot distingush between fault on source + * We cannot distinguish between fault on source * and fault on destination. When this happens * we clear too and hope it will trigger the * EFAULT again. @@ -557,9 +563,9 @@ static int open_kcore(struct inode *inode, struct file *filp) static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .llseek = default_llseek, }; -#ifdef CONFIG_MEMORY_HOTPLUG /* just remember that we have to update kcore */ static int __meminit kcore_callback(struct notifier_block *self, unsigned long action, void *arg) @@ -573,8 +579,11 @@ static int __meminit kcore_callback(struct notifier_block *self, } return NOTIFY_OK; } -#endif +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; static struct kcore_list kcore_vmalloc; @@ -586,7 +595,7 @@ static struct kcore_list kcore_text; */ static void __init proc_kcore_text_init(void) { - kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); } #else static void __init proc_kcore_text_init(void) @@ -615,7 +624,7 @@ static int __init proc_kcore_init(void) proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); if (!proc_root_kcore) { - printk(KERN_ERR "couldn't create /proc/kcore\n"); + pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. */ } /* Store text area if it's special */ @@ -626,8 +635,8 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + register_hotmemory_notifier(&kcore_callback_nb); return 0; } -module_init(proc_kcore_init); +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 7ca78346d3f..05f8dcdb086 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -12,37 +12,37 @@ #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/fs.h> +#include <linux/syslog.h> #include <asm/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; -extern int do_syslog(int type, char __user *bug, int count); - static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1,NULL,0); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0,NULL,0); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(2, buf, count); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } @@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = { .poll = kmsg_poll, .open = kmsg_open, .release = kmsg_release, + .llseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) @@ -60,4 +61,4 @@ static int __init proc_kmsg_init(void) proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); return 0; } -module_init(proc_kmsg_init); +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 1afa4dd4cae..aec66e6c206 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void) proc_create("loadavg", 0, NULL, &loadavg_proc_fops); return 0; } -module_init(proc_loadavg_init); +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97..7445af0b1aa 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,8 +1,8 @@ #include <linux/fs.h> -#include <linux/hugetlb.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> @@ -10,7 +10,8 @@ #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> -#include <asm/atomic.h> +#include <linux/atomic.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" @@ -23,10 +24,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; - unsigned long allowed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -36,11 +40,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); - allowed = ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -49,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. */ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -101,12 +135,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), @@ -143,13 +181,17 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), - K(allowed), + K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) + ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) #endif ); @@ -178,4 +220,4 @@ static int __init proc_meminfo_init(void) proc_create("meminfo", 0, NULL, &meminfo_proc_fops); return 0; } -module_init(proc_meminfo_init); +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c deleted file mode 100644 index 8ae221dfd01..00000000000 --- a/fs/proc/mmu.c +++ /dev/null @@ -1,60 +0,0 @@ -/* mmu.c: mmu memory info files - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <linux/spinlock.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <asm/pgtable.h> -#include "internal.h" - -void get_vmalloc_info(struct vmalloc_info *vmi) -{ - struct vm_struct *vma; - unsigned long free_area_size; - unsigned long prev_end; - - vmi->used = 0; - - if (!vmlist) { - vmi->largest_chunk = VMALLOC_TOTAL; - } - else { - vmi->largest_chunk = 0; - - prev_end = VMALLOC_START; - - read_lock(&vmlist_lock); - - for (vma = vmlist; vma; vma = vma->next) { - unsigned long addr = (unsigned long) vma->addr; - - /* - * Some archs keep another range for modules in vmlist - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; - - vmi->used += vma->size; - - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; - - prev_end = vma->size + addr; - } - - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk = VMALLOC_END - prev_end; - - read_unlock(&vmlist_lock); - } -} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000000..89026095f2b --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,297 @@ +#include <linux/proc_fs.h> +#include <linux/nsproxy.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = always_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = iget_locked(sb, ns_ops->inum(ns)); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns.ns_ops = ns_ops; + ei->ns.ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct path ns_path; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); + if (IS_ERR(ns_path.dentry)) { + error = ERR_CAST(ns_path.dentry); + goto out_put_task; + } + + ns_path.mnt = mntget(nd->path.mnt); + nd_jump_link(nd, &ns_path); + error = NULL; + +out_put_task: + put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int res = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + res = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); + res = readlink_copy(buffer, buflen, name); + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return res; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + +static int proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns.ns_ops = ns_ops; + + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, 0)) + return 0; +out: + return -ENOENT; +} + +static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + const struct proc_ns_operations **entry, **last; + + if (!task) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) + goto out; + entry = ns_entries + (ctx->pos - 2); + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + const struct proc_ns_operations *ops = *entry; + if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops)) + break; + ctx->pos++; + entry++; + } +out: + put_task_struct(task); + return 0; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .iterate = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + int error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = -ENOENT; + + if (!task) + goto out_no_task; + + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + if (entry == last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return ERR_PTR(error); +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + +struct proc_ns *get_proc_ns(struct inode *inode) +{ + return &PROC_I(inode)->ns; +} + +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 9fe7d7ebe11..d4a35746cab 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -21,7 +21,6 @@ #include <linux/mmzone.h> #include <linux/pagemap.h> #include <linux/swap.h> -#include <linux/slab.h> #include <linux/smp.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> @@ -40,19 +39,20 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; flags = region->vm_flags; file = region->vm_file; if (file) { - struct inode *inode = region->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(region->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", region->vm_start, region->vm_end, flags & VM_READ ? 'r' : '-', @@ -60,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', ((loff_t)region->vm_pgoff) << PAGE_SHIFT, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - len = 25 + sizeof(void *) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } @@ -134,4 +131,4 @@ static int __init proc_nommu_init(void) return 0; } -module_init(proc_nommu_init); +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index 180cf5a0bd6..e647c55275d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage) + if (!ppage || PageSlab(ppage)) pcount = 0; else pcount = page_mapcount(ppage); @@ -115,16 +115,27 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || + PageAnon(compound_head(page)))) + u |= 1 << KPF_THP; /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -146,7 +157,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif @@ -207,4 +218,4 @@ static int __init proc_page_init(void) proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); return 0; } -module_init(proc_page_init); +fs_initcall(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c deleted file mode 100644 index 123257bb356..00000000000 --- a/fs/proc/proc_devtree.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * proc_devtree.c - handles /proc/device-tree - * - * Copyright 1997 Paul Mackerras - */ -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <asm/prom.h> -#include <asm/uaccess.h> -#include "internal.h" - -#ifndef HAVE_ARCH_DEVTREE_FIXUPS -static inline void set_node_proc_entry(struct device_node *np, - struct proc_dir_entry *de) -{ -} -#endif - -static struct proc_dir_entry *proc_device_tree; - -/* - * Supply data on a read from /proc/device-tree/node/property. - */ -static int property_proc_show(struct seq_file *m, void *v) -{ - struct property *pp = m->private; - - seq_write(m, pp->value, pp->length); - return 0; -} - -static int property_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, property_proc_show, PDE(inode)->data); -} - -static const struct file_operations property_proc_fops = { - .owner = THIS_MODULE, - .open = property_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * For a node with a name like "gc@10", we make symlinks called "gc" - * and "@10" to it. - */ - -/* - * Add a property to a node - */ -static struct proc_dir_entry * -__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, - const char *name) -{ - struct proc_dir_entry *ent; - - /* - * Unfortunately proc_register puts each new entry - * at the beginning of the list. So we rearrange them. - */ - ent = proc_create_data(name, - strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, - de, &property_proc_fops, pp); - if (ent == NULL) - return NULL; - - if (!strncmp(name, "security-", 9)) - ent->size = 0; /* don't leak number of password chars */ - else - ent->size = pp->length; - - return ent; -} - - -void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) -{ - __proc_device_tree_add_prop(pde, prop, prop->name); -} - -void proc_device_tree_remove_prop(struct proc_dir_entry *pde, - struct property *prop) -{ - remove_proc_entry(prop->name, pde); -} - -void proc_device_tree_update_prop(struct proc_dir_entry *pde, - struct property *newprop, - struct property *oldprop) -{ - struct proc_dir_entry *ent; - - for (ent = pde->subdir; ent != NULL; ent = ent->next) - if (ent->data == oldprop) - break; - if (ent == NULL) { - printk(KERN_WARNING "device-tree: property \"%s\" " - " does not exist\n", oldprop->name); - } else { - ent->data = newprop; - ent->size = newprop->length; - } -} - -/* - * Various dodgy firmware might give us nodes and/or properties with - * conflicting names. That's generally ok, except for exporting via /proc, - * so munge names here to ensure they're unique. - */ - -static int duplicate_name(struct proc_dir_entry *de, const char *name) -{ - struct proc_dir_entry *ent; - int found = 0; - - spin_lock(&proc_subdir_lock); - - for (ent = de->subdir; ent != NULL; ent = ent->next) { - if (strcmp(ent->name, name) == 0) { - found = 1; - break; - } - } - - spin_unlock(&proc_subdir_lock); - - return found; -} - -static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, - const char *name) -{ - char *fixed_name; - int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ - int i = 1, size; - -realloc: - fixed_name = kmalloc(fixup_len, GFP_KERNEL); - if (fixed_name == NULL) { - printk(KERN_ERR "device-tree: Out of memory trying to fixup " - "name \"%s\"\n", name); - return name; - } - -retry: - size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); - size++; /* account for NULL */ - - if (size > fixup_len) { - /* We ran out of space, free and reallocate. */ - kfree(fixed_name); - fixup_len = size; - goto realloc; - } - - if (duplicate_name(de, fixed_name)) { - /* Multiple duplicates. Retry with a different offset. */ - i++; - goto retry; - } - - printk(KERN_WARNING "device-tree: Duplicate name in %s, " - "renamed to \"%s\"\n", np->full_name, fixed_name); - - return fixed_name; -} - -/* - * Process a node, adding entries for its children and its properties. - */ -void proc_device_tree_add_node(struct device_node *np, - struct proc_dir_entry *de) -{ - struct property *pp; - struct proc_dir_entry *ent; - struct device_node *child; - const char *p; - - set_node_proc_entry(np, de); - for (child = NULL; (child = of_get_next_child(np, child));) { - /* Use everything after the last slash, or the full name */ - p = strrchr(child->full_name, '/'); - if (!p) - p = child->full_name; - else - ++p; - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = proc_mkdir(p, de); - if (ent == NULL) - break; - proc_device_tree_add_node(child, ent); - } - of_node_put(child); - - for (pp = np->properties; pp != NULL; pp = pp->next) { - p = pp->name; - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = __proc_device_tree_add_prop(de, pp, p); - if (ent == NULL) - break; - } -} - -/* - * Called on initialization to set up the /proc/device-tree subtree - */ -void __init proc_device_tree_init(void) -{ - struct device_node *root; - - proc_device_tree = proc_mkdir("device-tree", NULL); - if (proc_device_tree == NULL) - return; - root = of_find_node_by_path("/"); - if (root == NULL) { - printk(KERN_ERR "/proc/device-tree: can't find root\n"); - return; - } - proc_device_tree_add_node(root, proc_device_tree); - of_node_put(root); -} diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 04d1270f1c3..4677bb7dc7c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -14,6 +14,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/module.h> @@ -25,6 +26,10 @@ #include "internal.h" +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} static struct net *get_proc_net(const struct inode *inode) { @@ -118,7 +123,7 @@ static struct net *get_proc_task_net(struct inode *dir) } static struct dentry *proc_tgid_net_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; @@ -155,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = { .getattr = proc_tgid_net_getattr, }; -static int proc_tgid_net_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; - net = get_proc_task_net(filp->f_path.dentry->d_inode); + net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + ret = proc_readdir_de(net->proc_net, file, ctx); put_net(net); } return ret; @@ -173,38 +177,24 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_tgid_net_readdir, + .iterate = proc_tgid_net_readdir, }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, mode_t mode, const struct file_operations *fops) -{ - return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ - remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd), GFP_KERNEL); + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); if (!netd) goto out; netd->data = net; netd->nlink = 2; - netd->name = "net"; netd->namelen = 3; netd->parent = &proc_root; + memcpy(netd->name, "net", 4); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6ff9981f0a1..71290463a1d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -3,8 +3,14 @@ */ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/poll.h> #include <linux/proc_fs.h> +#include <linux/printk.h> #include <linux/security.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/module.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -13,6 +19,379 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IFDIR|S_IRUGO|S_IXUGO, + }, + { } +}; +static struct ctl_table_root sysctl_table_root = { + .default_set.dir.header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table }}, + .ctl_table_arg = root_table, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + pr_cont("%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int minlen; + int cmp; + + minlen = len1; + if (minlen > len2) + minlen = len2; + + cmp = memcmp(name1, name2, minlen); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; + + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; + } + } + return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + pr_err("sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + pr_cont("/%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_node *node, struct ctl_table *table) +{ + head->ctl_table = table; + head->ctl_table_arg = table; + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; + head->node = node; + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) + node->header = head; + } +} + +static void erase_header(struct ctl_table_header *head) +{ + struct ctl_table *entry; + for (entry = head->ctl_table; entry->procname; entry++) + erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ + struct ctl_table *entry; + int err; + + dir->header.nreg++; + header->parent = dir; + err = insert_links(header); + if (err) + goto fail_links; + for (entry = header->ctl_table; entry->procname; entry++) { + err = insert_entry(header, entry); + if (err) + goto fail; + } + return 0; +fail: + erase_header(header); + put_links(header); +fail_links: + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + erase_header(p); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + BUG_ON(!head); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + spin_lock(&sysctl_lock); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + spin_unlock(&sysctl_lock); + return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ + struct ctl_node *ctl_node; + + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; + } + return NULL; +} + +static void first_entry(struct ctl_dir *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = NULL; + struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; + + spin_lock(&sysctl_lock); + ctl_node = first_usable_entry(rb_first(&dir->root)); + spin_unlock(&sysctl_lock); + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +void register_sysctl_root(struct ctl_table_root *root) +{ +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) + mode >>= 6; + else if (in_egroup_p(GLOBAL_ROOT_GID)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +{ + struct ctl_table_root *root = head->root; + int mode; + + if (root->permissions) + mode = root->permissions(head, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -23,21 +402,21 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (!inode) goto out; + inode->i_ino = get_next_ino(); + sysctl_head_get(head); ei = PROC_I(inode); ei->sysctl = head; ei->sysctl_entry = table; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ inode->i_mode = table->mode; - if (!table->child) { + if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - inode->i_nlink = 0; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } @@ -45,83 +424,54 @@ out: return inode; } -static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) -{ - int len; - for ( ; p->procname; p++) { - - if (!p->procname) - continue; - - len = strlen(p->procname); - if (len != name->len) - continue; - - if (memcmp(p->procname, name->name, len) != 0) - continue; - - /* I have a match */ - return p; - } - return NULL; -} - static struct ctl_table_header *grab_header(struct inode *inode) { - if (PROC_I(inode)->sysctl) - return sysctl_head_grab(PROC_I(inode)->sysctl); - else - return sysctl_head_next(NULL); + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &sysctl_table_root.default_set.dir.header; + return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ctl_table_header *head = grab_header(dir); - struct ctl_table *table = PROC_I(dir)->sysctl_entry; struct ctl_table_header *h = NULL; struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; + int ret; if (IS_ERR(head)) return ERR_CAST(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } - - table = table ? table->child : head->ctl_table; - - p = find_in_table(table, name); - if (!p) { - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - p = find_in_table(h->attached_by, name); - if (p) - break; - } - } + ctl_dir = container_of(head, struct ctl_dir, header); + p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + } + err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (h) - sysctl_head_finish(h); - if (!inode) goto out; err = NULL; - dentry->d_op = &proc_sys_dentry_operations; + d_set_d_op(dentry, &proc_sys_dentry_operations); d_add(dentry, inode); out: + if (h) + sysctl_head_finish(h); sysctl_head_finish(head); return err; } @@ -129,7 +479,7 @@ out: static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, size_t count, loff_t *ppos, int write) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; ssize_t error; @@ -143,7 +493,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ @@ -174,13 +524,61 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + sysctl_head_finish(head); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = file_inode(filp); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned int ret = DEFAULT_POLLMASK; + unsigned long event; -static int proc_sys_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, + /* sysctl was unregistered */ + if (IS_ERR(head)) + return POLLERR | POLLHUP; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + event = (unsigned long)filp->private_data; + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + sysctl_head_finish(head); + + return ret; +} + +static bool proc_sys_fill_cache(struct file *file, + struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; @@ -197,99 +595,89 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { dput(child); - return -ENOMEM; + return false; } else { - child->d_op = &proc_sys_dentry_operations; + d_set_d_op(child, &proc_sys_dentry_operations); d_add(child, inode); } } else { - return -ENOMEM; + return false; } } inode = child->d_inode; ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); - return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); + return dir_emit(ctx, qname.name, qname.len, ino, type); +} + +static bool proc_sys_link_fill_cache(struct file *file, + struct dir_context *ctx, + struct ctl_table_header *head, + struct ctl_table *table) +{ + bool ret = true; + head = sysctl_head_grab(head); + + if (S_ISLNK(table->mode)) { + /* It is not an error if we can not follow the link ignore it */ + int err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + } + + ret = proc_sys_fill_cache(file, ctx, head, table); +out: + sysctl_head_finish(head); + return ret; } static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, - void *dirent, filldir_t filldir) + struct dir_context *ctx) { + bool res; - for (; table->procname; table++, (*pos)++) { - int res; - - /* Can't do anything without a proc name */ - if (!table->procname) - continue; + if ((*pos)++ < ctx->pos) + return true; - if (*pos < file->f_pos) - continue; + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, ctx, head, table); + else + res = proc_sys_fill_cache(file, ctx, head, table); - res = proc_sys_fill_cache(file, dirent, filldir, head, table); - if (res) - return res; + if (res) + ctx->pos = *pos; - file->f_pos = *pos + 1; - } - return 0; + return res; } -static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; + struct ctl_table *entry; + struct ctl_dir *ctl_dir; unsigned long pos; - int ret = -EINVAL; if (IS_ERR(head)) return PTR_ERR(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } + ctl_dir = container_of(head, struct ctl_dir, header); - table = table ? table->child : head->ctl_table; + if (!dir_emit_dots(file, ctx)) + return 0; - ret = 0; - /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, - inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - } - if (filp->f_pos == 1) { - if (filldir(dirent, "..", 2, filp->f_pos, - parent_ino(dentry), DT_DIR) < 0) - goto out; - filp->f_pos++; - } pos = 2; - ret = scan(head, table, &pos, filp, dirent, filldir); - if (ret) - goto out; - - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); - if (ret) { + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { + if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } - ret = 1; -out: sysctl_head_finish(head); - return ret; + return 0; } static int proc_sys_permission(struct inode *inode, int mask) @@ -314,7 +702,7 @@ static int proc_sys_permission(struct inode *inode, int mask) if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask); + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; @@ -329,10 +717,12 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; error = inode_change_ok(inode, attr); - if (!error) - error = inode_setattr(inode, attr); + if (error) + return error; - return error; + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; } static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -353,12 +743,16 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct } static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, .read = proc_sys_read, .write = proc_sys_write, + .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { - .readdir = proc_sys_readdir, + .read = generic_read_dir, + .iterate = proc_sys_readdir, .llseek = generic_file_llseek, }; @@ -375,25 +769,51 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { + if (flags & LOOKUP_RCU) + return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_delete(struct dentry *dentry) +static int proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } -static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, - struct qstr *name) +static int sysctl_is_seen(struct ctl_table_header *p) { - struct dentry *dentry = container_of(qstr, struct dentry, d_name); - if (qstr->len != name->len) + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + struct inode *inode; + + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + inode = ACCESS_ONCE(dentry->d_inode); + if (!inode) + return 1; + if (name->len != len) return 1; - if (memcmp(qstr->name, name->name, name->len)) + if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { @@ -402,6 +822,753 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + entry = find_entry(&head, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + struct ctl_node *node; + char *new_name; + + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); + if (!new) + return NULL; + + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->dir.header.root, set, node, table); + + return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_set *set = dir->header.set; + struct ctl_dir *subdir, *new = NULL; + int err; + + spin_lock(&sysctl_lock); + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + /* Was the subdir added while we dropped the lock? */ + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + /* Nope. Use the our freshly made directory entry. */ + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) + goto failed; + subdir = new; +found: + subdir->header.nreg++; +failed: + if (unlikely(IS_ERR(subdir))) { + pr_err("sysctl could not get directory: "); + sysctl_print_dir(dir); + pr_cont("/%*.*s %ld\n", + namelen, namelen, name, PTR_ERR(subdir)); + } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + ret = 0; + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root, namespaces); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); + + va_end(args); + return -EINVAL; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ + int err = 0; + for (; table->procname; table++) { + if (table->child) + err = sysctl_err(path, table, "Not a file"); + + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + err = sysctl_err(path, table, "No data"); + if (!table->maxlen) + err = sysctl_err(path, table, "No maxlen"); + } + if (!table->proc_handler) + err = sysctl_err(path, table, "No proc_handler"); + + if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) + err = sysctl_err(path, table, "bogus .mode 0%o", + table->mode); + } + return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + struct ctl_node *node; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + for (entry = table; entry->procname; entry++) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); + link_name = (char *)&link_table[nr_entries + 1]; + + for (link = link_table, entry = table; entry->procname; link++, entry++) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + } + init_header(links, dir->header.root, dir->header.set, node, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent = NULL; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. + * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table) +{ + struct ctl_table_root *root = set->dir.header.root; + struct ctl_table_header *header; + const char *name, *nextname; + struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + for (entry = table; entry->procname; entry++) + nr_entries++; + + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + if (!header) + return NULL; + + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + dir = &set->dir; + /* Reference moved down the diretory tree get_subdir */ + dir->header.nreg++; + spin_unlock(&sysctl_lock); + + /* Find the directory for the ctl_table */ + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + goto fail; + } + + spin_lock(&sysctl_lock); + if (insert_header(dir, header)) + goto fail_put_dir_locked; + + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); + + return header; + +fail_put_dir_locked: + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, struct ctl_table_set *set, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + for (new = files, entry = table; entry->procname; entry++) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(set, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + for (entry = table; entry->procname; entry++) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + set, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = table; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } + if (nr_subheaders == 1) { + header = __register_sysctl_table(set, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; + header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + set, table)) + goto err_register_leaves; + } + +out: + kfree(new_path); + return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + for (entry = header->ctl_table; entry->procname; entry++) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + pr_err("sysctl link missing during unregister: "); + sysctl_print_dir(parent); + pr_cont("/%s\n", name); + } + } +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ + struct ctl_dir *parent = header->parent; + + if (--header->nreg) + return; + + put_links(header); + start_unregistering(header); + if (!--header->count) + kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + int nr_subheaders; + might_sleep(); + + if (header == NULL) + return; + + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + + spin_lock(&sysctl_lock); + drop_sysctl_table(header); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ + memset(set, 0, sizeof(*set)); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} + int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; @@ -410,5 +1577,6 @@ int __init proc_sys_init(void) proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return 0; + + return sysctl_init(); } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 83adcc86943..cb761f01030 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p, } switch (p->type) { case TTY_DRIVER_TYPE_SYSTEM: - seq_printf(m, "system"); + seq_puts(m, "system"); if (p->subtype == SYSTEM_TYPE_TTY) - seq_printf(m, ":/dev/tty"); + seq_puts(m, ":/dev/tty"); else if (p->subtype == SYSTEM_TYPE_SYSCONS) - seq_printf(m, ":console"); + seq_puts(m, ":console"); else if (p->subtype == SYSTEM_TYPE_CONSOLE) - seq_printf(m, ":vtmaster"); + seq_puts(m, ":vtmaster"); break; case TTY_DRIVER_TYPE_CONSOLE: - seq_printf(m, "console"); + seq_puts(m, "console"); break; case TTY_DRIVER_TYPE_SERIAL: - seq_printf(m, "serial"); + seq_puts(m, "serial"); break; case TTY_DRIVER_TYPE_PTY: if (p->subtype == PTY_TYPE_MASTER) - seq_printf(m, "pty:master"); + seq_puts(m, "pty:master"); else if (p->subtype == PTY_TYPE_SLAVE) - seq_printf(m, "pty:slave"); + seq_puts(m, "pty:slave"); else - seq_printf(m, "pty"); + seq_puts(m, "pty"); break; default: seq_printf(m, "type:%d.%d", p->type, p->subtype); @@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v) /* pseudo-drivers first */ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); - seq_printf(m, "system:/dev/tty\n"); + seq_puts(m, "system:/dev/tty\n"); seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); - seq_printf(m, "system:console\n"); + seq_puts(m, "system:console\n"); #ifdef CONFIG_UNIX98_PTYS seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); - seq_printf(m, "system\n"); + seq_puts(m, "system\n"); #endif #ifdef CONFIG_VT seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); - seq_printf(m, "system:vtmaster\n"); + seq_puts(m, "system:vtmaster\n"); #endif } diff --git a/fs/proc/root.c b/fs/proc/root.c index b080b791d9e..5dbadecb234 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,8 +16,10 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> +#include <linux/parser.h> #include "internal.h" @@ -28,62 +30,116 @@ static int proc_test_super(struct super_block *sb, void *data) static int proc_set_super(struct super_block *sb, void *data) { - struct pid_namespace *ns; + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; +} + +enum { + Opt_gid, Opt_hidepid, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = make_kgid(current_user_ns(), option); + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; - ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - return set_anon_super(sb, NULL); + sync_filesystem(sb); + return !proc_parse_options(data, pid); } -static int proc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; - - if (proc_mnt) { - /* Seed the root directory with a pid so it doesn't need - * to be special in base.c. I would do this earlier but - * the only task alive when /proc is mounted the first time - * is the init_task and it doesn't have any pids. - */ - ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); - if (!ei->pid) - ei->pid = find_get_pid(1); - } + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else - ns = current->nsproxy->pid_ns; + options = NULL; + } else { + ns = task_active_pid_ns(current); + options = data; - sb = sget(fs_type, proc_test_super, proc_set_super, ns); + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + /* Does the mounter have privilege over the pid namespace? */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + + sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); + + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } if (!sb->s_root) { - sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); - return err; - } - - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); + return ERR_PTR(err); } sb->s_flags |= MS_ACTIVE; - ns->proc_mnt = mnt; } - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } static void proc_kill_sb(struct super_block *sb) @@ -91,14 +147,17 @@ static void proc_kill_sb(struct super_block *sb) struct pid_namespace *ns; ns = (struct pid_namespace *)sb->s_fs_info; + if (ns->proc_self) + dput(ns->proc_self); kill_anon_super(sb); put_pid_ns(ns); } static struct file_system_type proc_fs_type = { .name = "proc", - .get_sb = proc_get_sb, + .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -109,13 +168,8 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - err = PTR_ERR(proc_mnt); - if (IS_ERR(proc_mnt)) { - unregister_filesystem(&proc_fs_type); - return; - } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -131,9 +185,6 @@ void __init proc_root_init(void) proc_mkdir("openprom", NULL); #endif proc_tty_init(); -#ifdef CONFIG_PROC_DEVICETREE - proc_device_tree_init(); -#endif proc_mkdir("bus", NULL); proc_sys_init(); } @@ -146,30 +197,24 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, nd)) { + if (!proc_lookup(dir, dentry, flags)) return NULL; - } - return proc_pid_lookup(dir, dentry, nd); + return proc_pid_lookup(dir, dentry, flags); } -static int proc_root_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_root_readdir(struct file *file, struct dir_context *ctx) { - unsigned int nr = filp->f_pos; - int ret; - - if (nr < FIRST_PROCESS_ENTRY) { - int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) + if (ctx->pos < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) return error; - filp->f_pos = FIRST_PROCESS_ENTRY; + ctx->pos = FIRST_PROCESS_ENTRY; } - ret = proc_pid_readdir(filp, dirent, filldir); - return ret; + return proc_pid_readdir(file, ctx); } /* @@ -179,7 +224,8 @@ static int proc_root_readdir(struct file * filp, */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .readdir = proc_root_readdir, + .iterate = proc_root_readdir, + .llseek = default_llseek, }; /* @@ -196,13 +242,13 @@ static const struct inode_operations proc_root_inode_operations = { struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, - .name = "/proc", .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .count = ATOMIC_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) @@ -213,16 +259,11 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) if (IS_ERR(mnt)) return PTR_ERR(mnt); + ns->proc_mnt = mnt; return 0; } void pid_ns_release_proc(struct pid_namespace *ns) { - mntput(ns->proc_mnt); + kern_unmount(ns->proc_mnt); } - -EXPORT_SYMBOL(proc_symlink); -EXPORT_SYMBOL(proc_mkdir); -EXPORT_SYMBOL(create_proc_entry); -EXPORT_SYMBOL(proc_create_data); -EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/self.c b/fs/proc/self.c new file mode 100644 index 00000000000..4348bb8907c --- /dev/null +++ b/fs/proc/self.c @@ -0,0 +1,84 @@ +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return readlink_copy(buffer, buflen, tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = kfree_put_link, +}; + +static unsigned self_inum; + +int proc_setup_self(struct super_block *s) +{ + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *self; + + mutex_lock(&root_inode->i_mutex); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + } else { + dput(self); + self = ERR_PTR(-ENOMEM); + } + } else { + self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(self)) { + pr_err("proc_fill_super: can't allocate /proc/self\n"); + return PTR_ERR(self); + } + ns->proc_self = self; + return 0; +} + +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); +} diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 1807c2419f1..ad8a77f94be 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v) { int i, j; - seq_printf(p, " "); + seq_puts(p, " "); for_each_possible_cpu(i) seq_printf(p, "CPU%-8d", i); - seq_printf(p, "\n"); + seq_putc(p, '\n'); for (i = 0; i < NR_SOFTIRQS; i++) { - seq_printf(p, "%8s:", softirq_to_name[i]); + seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); - seq_printf(p, "\n"); + seq_putc(p, '\n'); } return 0; } @@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void) proc_create("softirqs", 0, NULL, &proc_softirqs_operations); return 0; } -module_init(proc_softirqs_init); +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index b9b7aad2003..bf2d03f8fd3 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -1,6 +1,5 @@ #include <linux/cpumask.h> #include <linux/fs.h> -#include <linux/gfp.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> @@ -10,7 +9,8 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/irqnr.h> -#include <asm/cputime.h> +#include <linux/cputime.h> +#include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 @@ -18,44 +18,94 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif -#ifndef arch_idle_time -#define arch_idle_time(cpu) 0 + +#ifdef arch_idle_time + +static cputime64_t get_idle_time(int cpu) +{ + cputime64_t idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + cputime64_t iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else + +static u64 get_idle_time(int cpu) +{ + u64 idle, idle_time = -1ULL; + + if (cpu_online(cpu)) + idle_time = get_cpu_idle_time_us(cpu, NULL); + + if (idle_time == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + else + idle = usecs_to_cputime64(idle_time); + + return idle; +} + +static u64 get_iowait_time(int cpu) +{ + u64 iowait, iowait_time = -1ULL; + + if (cpu_online(cpu)) + iowait_time = get_cpu_iowait_time_us(cpu, NULL); + + if (iowait_time == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + else + iowait = usecs_to_cputime64(iowait_time); + + return iowait; +} + #endif static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest, guest_nice; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; - unsigned int per_irq_sum; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); - for_each_irq_nr(j) { - sum += kstat_irqs_cpu(j, i); - } + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { @@ -67,57 +117,49 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); - for_each_online_cpu(i) { + seq_puts(p, "cpu "); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; - idle = kstat_cpu(i).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(i)); - iowait = kstat_cpu(i).cpustat.iowait; - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - guest_nice = kstat_cpu(i).cpustat.guest_nice; - seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - i, - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(i); + iowait = get_iowait_time(i); + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); } seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_nr(j) { - per_irq_sum = 0; - for_each_possible_cpu(i) - per_irq_sum += kstat_irqs_cpu(j, i); - - seq_printf(p, " %u", per_irq_sum); - } + for_each_irq_nr(j) + seq_put_decimal_ull(p, ' ', kstat_irqs(j)); seq_printf(p, "\nctxt %llu\n" @@ -134,34 +176,19 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) - seq_printf(p, " %u", per_softirq_sums[i]); - seq_printf(p, "\n"); + seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); + seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { - unsigned size = 4096 * (1 + num_possible_cpus() / 32); - char *buf; - struct seq_file *m; - int res; - - /* don't ask for more than the kmalloc() max size, currently 128 KB */ - if (size > 128 * 1024) - size = 128 * 1024; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; + size_t size = 1024 + 128 * num_online_cpus(); + + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { @@ -176,4 +203,4 @@ static int __init proc_stat_init(void) proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } -module_init(proc_stat_init); +fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a111c..cfa63ee92c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,13 +1,18 @@ #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/hugetlb.h> +#include <linux/huge_mm.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> +#include <linux/slab.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> +#include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -16,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib; + unsigned long data, text, lib, swap; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -36,25 +41,31 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE * sizeof(pte_t) * + atomic_long_read(&mm->nr_ptes)) >> 10, + swap << (PAGE_SHIFT-10)); } unsigned long task_vsize(struct mm_struct *mm) @@ -62,29 +73,67 @@ unsigned long task_vsize(struct mm_struct *mm) return PAGE_SIZE * mm->total_vm; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, file_rss); + *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } -static void pad_len_spaces(struct seq_file *m, int len) +#ifdef CONFIG_NUMA +/* + * These functions are for numa_maps but called in generic **maps seq_file + * ->start(), ->stop() ops. + * + * numa_maps scans all vmas under mmap_sem and checks their mempolicy. + * Each mempolicy object is controlled by reference counting. The problem here + * is how to avoid accessing dead mempolicy object. + * + * Because we're holding mmap_sem while reading seq_file, it's safe to access + * each vma's mempolicy, no vma objects will never drop refs to mempolicy. + * + * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy + * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). + * So, without task_lock(), we cannot trust get_vma_policy() because we cannot + * gurantee the task never exits under us. But taking task_lock() around + * get_vma_plicy() causes lock order problem. + * + * To access task->mempolicy without lock, we hold a reference count of an + * object pointed by task->mempolicy and remember it. This will guarantee + * that task->mempolicy points to an alive object or NULL in numa_maps accesses. + */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ + struct task_struct *task = priv->task; + + task_lock(task); + priv->task_mempolicy = task->mempolicy; + mpol_get(priv->task_mempolicy); + task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv) { - len = 25 + sizeof(void*) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + mpol_put(priv->task_mempolicy); } +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; + release_task_mempolicy(priv); up_read(&mm->mmap_sem); mmput(mm); } @@ -104,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ @@ -114,16 +163,16 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = get_pid_task(priv->pid, PIDTYPE_PID); if (!priv->task) - return NULL; + return ERR_PTR(-ESRCH); - mm = mm_for_maps(priv->task); - if (!mm) - return NULL; + mm = mm_access(priv->task, PTRACE_MODE_READ); + if (!mm || IS_ERR(mm)) + return mm; down_read(&mm->mmap_sem); - tail_vma = get_gate_vma(priv->task); + tail_vma = get_gate_vma(priv->task->mm); priv->tail_vma = tail_vma; - + hold_task_mempolicy(priv); /* Start with last addr hint */ vma = find_vma(mm, last_addr); if (last_addr && vma) { @@ -150,6 +199,7 @@ out: if (vma) return vma; + release_task_mempolicy(priv); /* End of vmas has been reached */ m->version = (tail_vma != NULL)? 0: -1UL; up_read(&mm->mmap_sem); @@ -175,7 +225,8 @@ static void m_stop(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - vma_stop(priv, vma); + if (!IS_ERR(vma)) + vma_stop(priv, vma); if (priv->task) put_task_struct(priv->task); } @@ -199,109 +250,160 @@ static int do_maps_open(struct inode *inode, struct file *file, return ret; } -static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - int flags = vma->vm_flags; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; + unsigned long start, end; dev_t dev = 0; - int len; + const char *name = NULL; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + start, + end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (file) { - pad_len_spaces(m, len); + seq_pad(m, ' '); seq_path(m, &file->f_path, "\n"); - } else { - const char *name = arch_vma_name(vma); - if (!name) { - if (mm) { - if (vma->vm_start <= mm->start_brk && - vma->vm_end >= mm->brk) { - name = "[heap]"; - } else if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - name = "[stack]"; - } else { - unsigned long stack_start; - struct proc_maps_private *pmp; - - pmp = m->private; - stack_start = pmp->task->stack_start; - - if (vma->vm_start <= stack_start && - vma->vm_end >= stack_start) { - pad_len_spaces(m, len); - seq_printf(m, - "[threadstack:%08lx]", -#ifdef CONFIG_STACK_GROWSUP - vma->vm_end - stack_start -#else - stack_start - vma->vm_start -#endif - ); - } - } + goto done; + } + + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + + name = arch_vma_name(vma); + if (!name) { + pid_t tid; + + if (!mm) { + name = "[vdso]"; + goto done; + } + + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + goto done; + } + + tid = vm_is_stack(task, vma, is_pid); + + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) { + name = "[stack]"; } else { - name = "[vdso]"; + /* Thread stack in /proc/PID/maps */ + seq_pad(m, ' '); + seq_printf(m, "[stack:%d]", tid); } } - if (name) { - pad_len_spaces(m, len); - seq_puts(m, name); - } + } + +done: + if (name) { + seq_pad(m, ' '); + seq_puts(m, name); } seq_putc(m, '\n'); } -static int show_map(struct seq_file *m, void *v) +static int show_map(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - show_map_vma(m, vma); + show_map_vma(m, vma, is_pid); if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; return 0; } +static int show_pid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 0); +} + static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_map + .show = show_pid_map }; -static int maps_open(struct inode *inode, struct file *file) +static const struct seq_operations proc_tid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_maps_op); } -const struct file_operations proc_maps_operations = { - .open = maps_open, +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, @@ -335,61 +437,151 @@ struct mem_size_stats { unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + unsigned long anonymous; + unsigned long anonymous_thp; unsigned long swap; + unsigned long nonlinear; u64 pss; }; -static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, + unsigned long ptent_size, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; - pte_t *pte, ptent; - spinlock_t *ptl; - struct page *page; + pgoff_t pgoff = linear_page_index(vma, addr); + struct page *page = NULL; int mapcount; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - - if (is_swap_pte(ptent)) { - mss->swap += PAGE_SIZE; - continue; - } + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); + + if (!non_swap_entry(swpent)) + mss->swap += ptent_size; + else if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } else if (pte_file(ptent)) { + if (pte_to_pgoff(ptent) != pgoff) + mss->nonlinear += ptent_size; + } - if (!pte_present(ptent)) - continue; + if (!page) + return; + + if (PageAnon(page)) + mss->anonymous += ptent_size; + + if (page->index != pgoff) + mss->nonlinear += ptent_size; + + mss->resident += ptent_size; + /* Accumulate the size in pages that have been accessed. */ + if (pte_young(ptent) || PageReferenced(page)) + mss->referenced += ptent_size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + if (pte_dirty(ptent) || PageDirty(page)) + mss->shared_dirty += ptent_size; + else + mss->shared_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT) / mapcount; + } else { + if (pte_dirty(ptent) || PageDirty(page)) + mss->private_dirty += ptent_size; + else + mss->private_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT); + } +} - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + pte_t *pte; + spinlock_t *ptl; - mss->resident += PAGE_SIZE; - /* Accumulate the size in pages that have been accessed. */ - if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += PAGE_SIZE; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - if (pte_dirty(ptent)) - mss->shared_dirty += PAGE_SIZE; - else - mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; - } else { - if (pte_dirty(ptent)) - mss->private_dirty += PAGE_SIZE; - else - mss->private_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT); - } + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); + spin_unlock(ptl); + mss->anonymous_thp += HPAGE_PMD_SIZE; + return 0; } + + if (pmd_trans_unstable(pmd)) + return 0; + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; } -static int show_smap(struct seq_file *m, void *v) +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_MEM_SOFT_DIRTY + [ilog2(VM_SOFTDIRTY)] = "sd", +#endif + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + +static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; @@ -403,10 +595,11 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof mss); mss.vma = vma; + /* mmap_sem is held in m_start */ if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); - show_map_vma(m, vma); + show_map_vma(m, vma, is_pid); seq_printf(m, "Size: %8lu kB\n" @@ -417,9 +610,12 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" "Swap: %8lu kB\n" "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -428,45 +624,151 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, + mss.anonymous >> 10, + mss.anonymous_thp >> 10, mss.swap >> 10, vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + + if (vma->vm_flags & VM_NONLINEAR) + seq_printf(m, "Nonlinear: %8lu kB\n", + mss.nonlinear >> 10); + + show_smap_vma_flags(m, vma); if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; return 0; } +static int show_pid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 0); +} + static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_smap + .show = show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_smap }; -static int smaps_open(struct inode *inode, struct file *file) +static int pid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_smaps_op); } -const struct file_operations proc_smaps_operations = { - .open = smaps_open, +static int tid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { + .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; +const struct file_operations proc_tid_smaps_operations = { + .open = tid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); + } + + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; + + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + if (!pte_present(ptent)) continue; @@ -483,10 +785,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -494,29 +792,47 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - long type; + enum clear_refs_types type; + int itype; + int rv; memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - if (strict_strtol(strstrip(buffer), 10, &type)) - return -EINVAL; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; - task = get_proc_task(file->f_path.dentry->d_inode); + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning!" + " See the linux/Documentation/vm/pagemap.txt for " + "details.\n"); + } + + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -527,14 +843,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * * Writing 3 to /proc/pid/clear_refs only affects file * mapped pages. + * + * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (type == CLEAR_REFS_ANON && vma->vm_file) continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; + if (type == CLEAR_REFS_SOFT_DIRTY) { + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -546,13 +870,23 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, + .llseek = noop_llseek, }; +typedef struct { + u64 pme; +} pagemap_entry_t; + struct pagemapread { - u64 __user *out, *end; + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ + pagemap_entry_t *buffer; + bool v2; }; -#define PM_ENTRY_BYTES sizeof(u64) +#define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) + +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) @@ -560,22 +894,29 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_FILE PM_STATUS(1LL) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 -static int add_to_pagemap(unsigned long addr, u64 pfn, +static inline pagemap_entry_t make_pme(u64 val) +{ + return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, struct pagemapread *pm) { - if (put_user(pfn, pm->out)) - return -EFAULT; - pm->out++; - if (pm->out >= pm->end) + pm->buffer[pm->pos++] = *pme; + if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } @@ -586,60 +927,136 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + err = add_to_pagemap(addr, &pme, pm); if (err) break; } return err; } -static u64 swap_pte_to_pagemap_entry(pte_t pte) +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - swp_entry_t e = pte_to_swp_entry(pte); - return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); + u64 frame, flags; + struct page *page = NULL; + int flags2 = 0; + + if (pte_present(pte)) { + frame = pte_pfn(pte); + flags = PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + } else if (is_swap_pte(pte)) { + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags = PM_SWAP; + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } else { + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 |= __PM_SOFT_DIRTY; + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); + return; + } + + if (page && !PageAnon(page)) + flags |= PM_FILE; + if ((vma->vm_flags & VM_SOFTDIRTY)) + flags2 |= __PM_SOFT_DIRTY; + + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } -static u64 pte_to_pagemap_entry(pte_t pte) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { - u64 pme = 0; - if (is_swap_pte(pte)) - pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; - else if (pte_present(pte)) - pme = PM_PFRAME(pte_pfn(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - return pme; + /* + * Currently pmd for thp is always present because thp can not be + * swapped-out, migrated, or HWPOISONed (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) + *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); } +#else +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) +{ +} +#endif static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + int pmd_flags2; + + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) + pmd_flags2 = __PM_SOFT_DIRTY; + else + pmd_flags2 = 0; + + for (; addr != end; addr += PAGE_SIZE) { + unsigned long offset; + + offset = (addr & ~PAGEMAP_WALK_MASK) >> + PAGE_SHIFT; + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + spin_unlock(ptl); + return err; + } + + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { - u64 pfn = PM_NOT_PRESENT; + int flags2; /* check to see if we've left 'vma' behind * and need a new, higher one */ - if (vma && (addr >= vma->vm_end)) + if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); + } /* check that 'vma' actually covers this address, * and that it isn't a huge page vma */ if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pfn = pte_to_pagemap_entry(*pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } - err = add_to_pagemap(addr, pfn, pm); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; } @@ -649,41 +1066,42 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } -static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +#ifdef CONFIG_HUGETLB_PAGE +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pte_t pte, int offset, int flags2) { - u64 pme = 0; if (pte_present(pte)) - pme = PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - return pme; + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | + PM_STATUS2(pm->v2, flags2) | + PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | + PM_STATUS2(pm->v2, flags2)); } -static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, - unsigned long end, struct mm_walk *walk) +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { - struct vm_area_struct *vma; struct pagemapread *pm = walk->private; - struct hstate *hs = NULL; + struct vm_area_struct *vma; int err = 0; + int flags2; + pagemap_entry_t pme; vma = find_vma(walk->mm, addr); - if (vma) - hs = hstate_vma(vma); - for (; addr != end; addr += PAGE_SIZE) { - u64 pfn = PM_NOT_PRESENT; + WARN_ON_ONCE(!vma); - if (vma && (addr >= vma->vm_end)) { - vma = find_vma(walk->mm, addr); - if (vma) - hs = hstate_vma(vma); - } + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; - if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) { - /* calculate pfn of the "raw" page in the hugepage. */ - int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT; - pfn = huge_pte_to_pagemap_entry(*pte, offset); - } - err = add_to_pagemap(addr, pfn, pm); + for (; addr != end; addr += PAGE_SIZE) { + int offset = (addr & ~hmask) >> PAGE_SHIFT; + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; } @@ -692,6 +1110,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, return err; } +#endif /* HUGETLB_PAGE */ /* * /proc/pid/pagemap - an array mapping virtual pages to pfns @@ -699,11 +1118,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * - * Bits 0-55 page frame number (PFN) if present + * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped - * Bits 5-55 swap offset if swapped + * Bits 5-54 swap offset if swapped * Bits 55-60 page shift (page size = 1<<page shift) - * Bit 61 reserved for future use + * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present * @@ -720,72 +1139,46 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - struct page **pages, *page; - unsigned long uaddr, uend; + struct task_struct *task = get_proc_task(file_inode(file)); struct mm_struct *mm; struct pagemapread pm; - int pagecount; int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; + int copied = 0; if (!task) goto out; - ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_task; - ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_task; ret = 0; - if (!count) goto out_task; - mm = get_task_mm(task); - if (!mm) - goto out_task; - - - uaddr = (unsigned long)buf & PAGE_MASK; - uend = (unsigned long)(buf + count); - pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; - ret = 0; - if (pagecount == 0) - goto out_mm; - pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + pm.v2 = soft_dirty_cleared; + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); ret = -ENOMEM; - if (!pages) - goto out_mm; - - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, pagecount, - 1, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); + if (!pm.buffer) + goto out_task; - if (ret < 0) + mm = mm_access(task, PTRACE_MODE_READ); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) goto out_free; - if (ret != pagecount) { - pagecount = ret; - ret = -EFAULT; - goto out_pages; - } - - pm.out = (u64 __user *)buf; - pm.end = (u64 __user *)(buf + count); - pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif pagemap_walk.mm = mm; pagemap_walk.private = ± @@ -804,56 +1197,350 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, * user buffer is tracked in "pm", and the walk * will stop when we hit the end of the buffer. */ - ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); - if (ret == PM_END_OF_BUFFER) - ret = 0; - /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += (char __user *)pm.out - buf; - if (!ret) - ret = (char __user *)pm.out - buf; - -out_pages: - for (; pagecount; pagecount--) { - page = pages[pagecount-1]; - if (!PageReserved(page)) - SetPageDirty(page); - page_cache_release(page); + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_mm; + } + copied += len; + buf += len; + count -= len; } -out_free: - kfree(pages); + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + out_mm: mmput(mm); +out_free: + kfree(pm.buffer); out_task: put_task_struct(task); out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (!pte_present(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte), 1); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v, int is_pid) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct task_struct *task = proc_priv->task; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + char buffer[64]; + int nid; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_puts(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_puts(m, " heap"); + } else { + pid_t tid = vm_is_stack(task, vma, is_pid); + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_puts(m, " stack"); + else + seq_printf(m, " stack:%d", tid); + } + } + + if (is_vm_hugetlb_page(vma)) + seq_puts(m, " huge"); + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(nid, N_MEMORY) + if (md->node[nid]) + seq_printf(m, " N%d=%lu", nid, md->node[nid]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; + return 0; +} + +static int show_pid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 0); +} static const struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map, + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_numa_map, }; -static int numa_maps_open(struct inode *inode, struct file *file) +static const struct seq_operations proc_tid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { - return do_maps_open(inode, file, &proc_pid_numa_maps_op); + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; } -const struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { + .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; -#endif + +const struct file_operations proc_tid_numa_maps_operations = { + .open = tid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 5d9fd64ef81..678455d2d68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -5,6 +5,7 @@ #include <linux/fs_struct.h> #include <linux/mount.h> #include <linux/ptrace.h> +#include <linux/slab.h> #include <linux/seq_file.h> #include "internal.h" @@ -91,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm) return vsize; } -int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) { struct vm_area_struct *vma; struct vm_region *region; struct rb_node *p; - int size = kobjsize(mm); + unsigned long size = kobjsize(mm); down_read(&mm->mmap_sem); for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { @@ -124,26 +126,30 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, /* * display a single VMA to a sequenced file */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, + int is_pid) { + struct mm_struct *mm = vma->vm_mm; + struct proc_maps_private *priv = m->private; unsigned long ino = 0; struct file *file; dev_t dev = 0; - int flags, len; + int flags; unsigned long long pgoff = 0; flags = vma->vm_flags; file = vma->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, - "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', @@ -151,14 +157,26 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', pgoff, - MAJOR(dev), MINOR(dev), ino, &len); + MAJOR(dev), MINOR(dev), ino); if (file) { - len = 25 + sizeof(void *) * 6 - len; - if (len < 1) - len = 1; - seq_printf(m, "%*c", len, ' '); + seq_pad(m, ' '); seq_path(m, &file->f_path, ""); + } else if (mm) { + pid_t tid = vm_is_stack(priv->task, vma, is_pid); + + if (tid != 0) { + seq_pad(m, ' '); + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, "[stack]"); + else + seq_printf(m, "[stack:%d]", tid); + } } seq_putc(m, '\n'); @@ -168,11 +186,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_p) +static int show_map(struct seq_file *m, void *_p, int is_pid) { struct rb_node *p = _p; - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), + is_pid); +} + +static int show_pid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 1); +} + +static int show_tid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 0); } static void *m_start(struct seq_file *m, loff_t *pos) @@ -185,13 +214,13 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* pin the task and mm whilst we play with them */ priv->task = get_pid_task(priv->pid, PIDTYPE_PID); if (!priv->task) - return NULL; + return ERR_PTR(-ESRCH); - mm = mm_for_maps(priv->task); - if (!mm) { + mm = mm_access(priv->task, PTRACE_MODE_READ); + if (!mm || IS_ERR(mm)) { put_task_struct(priv->task); priv->task = NULL; - return NULL; + return mm; } down_read(&mm->mmap_sem); @@ -226,10 +255,18 @@ static const struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_map + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map }; -static int maps_open(struct inode *inode, struct file *file) +static int maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { struct proc_maps_private *priv; int ret = -ENOMEM; @@ -237,7 +274,7 @@ static int maps_open(struct inode *inode, struct file *file) priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { priv->pid = proc_pid(inode); - ret = seq_open(file, &proc_pid_maps_ops); + ret = seq_open(file, ops); if (!ret) { struct seq_file *m = file->private_data; m->private = priv; @@ -248,8 +285,25 @@ static int maps_open(struct inode *inode, struct file *file) return ret; } -const struct file_operations proc_maps_operations = { - .open = maps_open, +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_pid_maps_ops); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_tid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d45605..33de567c25a 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,21 +5,25 @@ #include <linux/seq_file.h> #include <linux/time.h> #include <linux/kernel_stat.h> -#include <asm/cputime.h> +#include <linux/cputime.h> static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = cputime_zero; + idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - do_posix_clock_monotonic_gettime(&uptime); - monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + get_monotonic_boottime(&uptime); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), @@ -45,4 +49,4 @@ static int __init proc_uptime_init(void) proc_create("uptime", 0, NULL, &uptime_proc_fops); return 0; } -module_init(proc_uptime_init); +fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 76817a60678..d2154eb6d78 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -31,4 +31,4 @@ static int __init proc_version_init(void) proc_create("version", 0, NULL, &version_proc_fops); return 0; } -module_init(proc_version_init); +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0872afa58d3..382aa890e22 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,17 +8,23 @@ */ #include <linux/mm.h> -#include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/export.h> +#include <linux/slab.h> #include <linux/highmem.h> +#include <linux/printk.h> #include <linux/bootmem.h> #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" /* List representing chunks of contiguous memory areas and their offsets in * vmcore file. @@ -28,11 +34,55 @@ static LIST_HEAD(vmcore_list); /* Stores the pointer to the buffer containing kernel elf core headers. */ static char *elfcorebuf; static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; + +static char *elfnotes_buf; +static size_t elfnotes_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -static struct proc_dir_entry *proc_vmcore = NULL; +static struct proc_dir_entry *proc_vmcore; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, @@ -54,9 +104,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, else nr_bytes = count; - tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) - return tmp; + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; @@ -68,37 +124,70 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } -/* Maps vmcore file offset to respective physical address in memroy. */ -static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, - struct vmcore **m_ptr) +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { - struct vmcore *m; - u64 paddr; + return 0; +} - list_for_each_entry(m, vc_list, list) { - u64 start, end; - start = m->offset; - end = m->offset + m->size - 1; - if (offset >= start && offset <= end) { - paddr = m->paddr + offset - start; - *m_ptr = m; - return paddr; - } +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Copy to either kernel or user space + */ +static int copy_to(void *target, void *src, size_t size, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *) target, src, size)) + return -EFAULT; + } else { + memcpy(target, src, size); } - *m_ptr = NULL; return 0; } /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, + int userbuf) { ssize_t acc = 0, tmp; size_t tsz; - u64 start, nr_bytes; - struct vmcore *curr_m = NULL; + u64 start; + struct vmcore *m = NULL; if (buflen == 0 || *fpos >= vmcore_size) return 0; @@ -109,10 +198,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = elfcorebuf_sz - *fpos; - if (buflen < tsz) - tsz = buflen; - if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); + if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -124,146 +211,395 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } - start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); - if (!curr_m) - return -EINVAL; - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; - - while (buflen) { - tmp = read_from_oldmem(buffer, tsz, &start, 1); - if (tmp < 0) - return tmp; + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + if (copy_to(buffer, kaddr, tsz, userbuf)) + return -EFAULT; buflen -= tsz; *fpos += tsz; buffer += tsz; acc += tsz; - if (start >= (curr_m->paddr + curr_m->size)) { - if (curr_m->list.next == &vmcore_list) - return acc; /*EOF*/ - curr_m = list_entry(curr_m->list.next, - struct vmcore, list); - start = curr_m->paddr; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; } - if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) - tsz = buflen; - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; } + return acc; } -static const struct file_operations proc_vmcore_operations = { - .read = read_vmcore, +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + return __read_vmcore((__force char *) buffer, buflen, fpos, 1); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __vmcore_read() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct page *page; + loff_t offset; + char *buf; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_CACHE_SHIFT; + buf = __va((page_to_pfn(page) << PAGE_SHIFT)); + rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + if (rc < 0) { + unlock_page(page); + page_cache_release(page); + return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, }; -static struct vmcore* __init get_new_element(void) +/** + * alloc_elfnotes_buf - allocate buffer for ELF note segment in + * vmalloc memory + * + * @notes_sz: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *alloc_elfnotes_buf(size_t notes_sz) { - return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +#ifdef CONFIG_MMU + return vmalloc_user(notes_sz); +#else + return vzalloc(notes_sz); +#endif } -static u64 __init get_vmcore_size_elf64(char *elfptr) +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { - int i; - u64 size; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr *phdr_ptr; + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); - size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; } - return size; + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, tsz)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = min_t(size_t, m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (remap_oldmem_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; } +#endif + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .llseek = default_llseek, + .mmap = mmap_vmcore, +}; -static u64 __init get_vmcore_size_elf32(char *elfptr) +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { - int i; u64 size; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr *phdr_ptr; + struct vmcore *m; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); - size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; } return size; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf64_Phdr *phdr_ptr; Elf64_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf64_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf64_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -277,6 +613,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -284,67 +622,170 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return 0; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf32_Phdr *phdr_ptr; Elf32_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { - if (nhdr_ptr->n_namesz == 0) - break; + while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf32_Nhdr) + ((nhdr_ptr->n_namesz + 3) & ~3) + ((nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf32_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -358,6 +799,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -369,6 +812,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, * the new offset fields of exported program headers. */ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -380,32 +824,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset. */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -417,43 +867,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf64(char *elfptr, - struct list_head *vc_list) +static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; - Elf64_Ehdr *ehdr_ptr; struct vmcore *m; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { m->offset = vmcore_off; @@ -461,24 +912,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr, } } -/* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf32(char *elfptr, - struct list_head *vc_list) +static void free_elfcorebuf(void) { - loff_t vmcore_off; - Elf32_Ehdr *ehdr_ptr; - struct vmcore *m; - - ehdr_ptr = (Elf32_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); - - list_for_each_entry(m, vc_list, list) { - m->offset = vmcore_off; - vmcore_off += m->size; - } + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; } static int __init parse_crash_elf64_headers(void) @@ -490,51 +929,51 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !vmcore_elf_check_arch(&ehdr) || + !vmcore_elf64_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS64 || ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || ehdr.e_ehsize != sizeof(Elf64_Ehdr) || ehdr.e_phentsize != sizeof(Elf64_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf32_headers(void) @@ -546,7 +985,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -560,37 +999,36 @@ static int __init parse_crash_elf32_headers(void) ehdr.e_ehsize != sizeof(Elf32_Ehdr) || ehdr.e_phentsize != sizeof(Elf32_Phdr) || ehdr.e_phnum == 0) { - printk(KERN_WARNING "Warning: Core image elf header is not" - "sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf_headers(void) @@ -600,12 +1038,11 @@ static int __init parse_crash_elf_headers(void) int rc=0; addr = elfcorehdr_addr; - rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { - printk(KERN_WARNING "Warning: Core image elf header" - " not found\n"); + pr_warn("Warning: Core image elf header not found\n"); return -EINVAL; } @@ -613,21 +1050,19 @@ static int __init parse_crash_elf_headers(void) rc = parse_crash_elf64_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf64(elfcorebuf); } else if (e_ident[EI_CLASS] == ELFCLASS32) { rc = parse_crash_elf32_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf32(elfcorebuf); } else { - printk(KERN_WARNING "Warning: Core image elf header is not" - " sane\n"); + pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + return 0; } @@ -636,18 +1071,48 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, + * then capture the dump. + */ if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); if (rc) { - printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + pr_warn("Kdump: vmcore not initialized\n"); return rc; } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; } -module_init(vmcore_init) +fs_initcall(vmcore_init); + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ + struct list_head *pos, *next; + + if (proc_vmcore) { + proc_remove(proc_vmcore); + proc_vmcore = NULL; + } + + /* clear the vmcore list. */ + list_for_each_safe(pos, next, &vmcore_list) { + struct vmcore *m; + + m = list_entry(pos, struct vmcore, list); + list_del(&m->list); + kfree(m); + } + free_elfcorebuf(); +} |
