aboutsummaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig14
-rw-r--r--fs/proc/Makefile11
-rw-r--r--fs/proc/array.c375
-rw-r--r--fs/proc/base.c2458
-rw-r--r--fs/proc/cmdline.c2
-rw-r--r--fs/proc/consoles.c112
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/devices.c6
-rw-r--r--fs/proc/fd.c351
-rw-r--r--fs/proc/fd.h19
-rw-r--r--fs/proc/generic.c727
-rw-r--r--fs/proc/inode.c426
-rw-r--r--fs/proc/internal.h318
-rw-r--r--fs/proc/interrupts.c2
-rw-r--r--fs/proc/kcore.c37
-rw-r--r--fs/proc/kmsg.c17
-rw-r--r--fs/proc/loadavg.c2
-rw-r--r--fs/proc/meminfo.c62
-rw-r--r--fs/proc/mmu.c60
-rw-r--r--fs/proc/namespaces.c297
-rw-r--r--fs/proc/nommu.c17
-rw-r--r--fs/proc/page.c29
-rw-r--r--fs/proc/proc_devtree.c234
-rw-r--r--fs/proc/proc_net.c34
-rw-r--r--fs/proc/proc_sysctl.c1420
-rw-r--r--fs/proc/proc_tty.c26
-rw-r--r--fs/proc/root.c179
-rw-r--r--fs/proc/self.c84
-rw-r--r--fs/proc/softirqs.c10
-rw-r--r--fs/proc/stat.c211
-rw-r--r--fs/proc/task_mmu.c1177
-rw-r--r--fs/proc/task_nommu.c100
-rw-r--r--fs/proc/uptime.c18
-rw-r--r--fs/proc/version.c2
-rw-r--r--fs/proc/vmcore.c939
35 files changed, 6445 insertions, 3333 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 50f8f0600f0..2183fcf41d5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
config PROC_FS
- bool "/proc file system support" if EMBEDDED
+ bool "/proc file system support" if EXPERT
default y
help
This is a virtual file system providing information about the status
@@ -31,16 +31,20 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+ help
+ Provides a virtual ELF core file of the live kernel. This can
+ be read with gdb and other ELF tools. No modifications can be
+ made using this mechanism.
config PROC_VMCORE
- bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && CRASH_DUMP
+ bool "/proc/vmcore support"
+ depends on PROC_FS && CRASH_DUMP
default y
help
Exports the dump image of crashed kernel in ELF format.
config PROC_SYSCTL
- bool "Sysctl support (/proc/sys)" if EMBEDDED
+ bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
select SYSCTL
default y
@@ -61,7 +65,7 @@ config PROC_SYSCTL
config PROC_PAGE_MONITOR
default y
depends on PROC_FS && MMU
- bool "Enable /proc page monitoring" if EMBEDDED
+ bool "Enable /proc page monitoring" if EXPERT
help
Various /proc files exist to monitor process memory utilization:
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c6815..239493ec718 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,14 +2,16 @@
# Makefile for the Linux proc filesystem routines.
#
-obj-$(CONFIG_PROC_FS) += proc.o
+obj-y += proc.o
proc-y := nommu.o task_nommu.o
-proc-$(CONFIG_MMU) := mmu.o task_mmu.o
+proc-$(CONFIG_MMU) := task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
- proc_tty.o
+ fd.o
+proc-$(CONFIG_TTY) += proc_tty.o
proc-y += cmdline.o
+proc-y += consoles.o
proc-y += cpuinfo.o
proc-y += devices.o
proc-y += interrupts.o
@@ -19,10 +21,11 @@ proc-y += stat.o
proc-y += uptime.o
proc-y += version.o
proc-y += softirqs.o
+proc-y += namespaces.o
+proc-y += self.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
-proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 13b5d070817..64db2bceac5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/signal.h>
#include <linux/highmem.h>
@@ -82,7 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
-#include <linux/swapops.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -97,7 +96,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
get_task_comm(tcomm, p);
- seq_printf(m, "Name:\t");
+ seq_puts(m, "Name:\t");
end = m->buf + m->size;
buf = m->buf + m->count;
name = tcomm;
@@ -124,7 +123,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
buf++;
}
m->count = buf - m->buf;
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
/*
@@ -133,36 +132,29 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
* you can test for combinations of others with
* simple bit tests.
*/
-static const char *task_state_array[] = {
+static const char * const task_state_array[] = {
"R (running)", /* 0 */
"S (sleeping)", /* 1 */
"D (disk sleep)", /* 2 */
"T (stopped)", /* 4 */
"t (tracing stop)", /* 8 */
- "Z (zombie)", /* 16 */
- "X (dead)", /* 32 */
- "x (dead)", /* 64 */
- "K (wakekill)", /* 128 */
- "W (waking)", /* 256 */
+ "X (dead)", /* 16 */
+ "Z (zombie)", /* 32 */
};
static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
- const char **p = &task_state_array[0];
+ unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
- BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
- while (state) {
- p++;
- state >>= 1;
- }
- return *p;
+ return task_state_array[fls(state)];
}
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
+ struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
int g;
struct fdtable *fdt = NULL;
@@ -174,14 +166,15 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
tpid = 0;
if (pid_alive(p)) {
- struct task_struct *tracer = tracehook_tracer_task(p);
+ struct task_struct *tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
}
- cred = get_cred((struct cred *) __task_cred(p));
+ cred = get_task_cred(p);
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
+ "Ngid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
@@ -189,10 +182,17 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
task_tgid_nr_ns(p, ns),
+ task_numa_group_id(p),
pid_nr_ns(pid, ns),
ppid, tpid,
- cred->uid, cred->euid, cred->suid, cred->fsuid,
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kuid_munged(user_ns, cred->uid),
+ from_kuid_munged(user_ns, cred->euid),
+ from_kuid_munged(user_ns, cred->suid),
+ from_kuid_munged(user_ns, cred->fsuid),
+ from_kgid_munged(user_ns, cred->gid),
+ from_kgid_munged(user_ns, cred->egid),
+ from_kgid_munged(user_ns, cred->sgid),
+ from_kgid_munged(user_ns, cred->fsgid));
task_lock(p);
if (p->files)
@@ -206,19 +206,20 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
group_info = cred->group_info;
task_unlock(p);
- for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
- seq_printf(m, "%d ", GROUP_AT(group_info, g));
+ for (g = 0; g < group_info->ngroups; g++)
+ seq_printf(m, "%d ",
+ from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
put_cred(cred);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
-static void render_sigset_t(struct seq_file *m, const char *header,
+void render_sigset_t(struct seq_file *m, const char *header,
sigset_t *set)
{
int i;
- seq_printf(m, "%s", header);
+ seq_puts(m, header);
i = _NSIG;
do {
@@ -232,7 +233,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
seq_printf(m, "%x", x);
} while (i >= 4);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -269,9 +270,11 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
shpending = p->signal->shared_pending.signal;
blocked = p->blocked;
collect_sigign_sigcatch(p, &ignored, &caught);
- num_threads = atomic_read(&p->signal->count);
+ num_threads = get_nr_threads(p);
+ rcu_read_lock(); /* FIXME: is this correct? */
qsize = atomic_read(&__task_cred(p)->user->sigpending);
- qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
+ rcu_read_unlock();
+ qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
}
@@ -291,14 +294,18 @@ static void render_cap_t(struct seq_file *m, const char *header,
{
unsigned __capi;
- seq_printf(m, "%s", header);
+ seq_puts(m, header);
CAP_FOR_EACH_U32(__capi) {
seq_printf(m, "%08x",
a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
+/* Remove non-existent capabilities */
+#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
+ CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
+
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
@@ -312,12 +319,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_bset = cred->cap_bset;
rcu_read_unlock();
+ NORM_CAPS(cap_inheritable);
+ NORM_CAPS(cap_permitted);
+ NORM_CAPS(cap_effective);
+ NORM_CAPS(cap_bset);
+
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
}
+static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+#endif
+}
+
static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
{
@@ -329,12 +348,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Cpus_allowed:\t");
+ seq_puts(m, "Cpus_allowed:\t");
seq_cpumask(m, &task->cpus_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Cpus_allowed_list:\t");
+ seq_putc(m, '\n');
+ seq_puts(m, "Cpus_allowed_list:\t");
seq_cpumask_list(m, &task->cpus_allowed);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -351,11 +370,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
}
task_sig(m, task);
task_cap(m, task);
+ task_seccomp(m, task);
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
-#if defined(CONFIG_S390)
- task_show_regs(m, task);
-#endif
task_context_switch_counts(m, task);
return 0;
}
@@ -364,7 +381,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
- long priority, nice;
+ int priority, nice;
int tty_pgrp = -1, tty_nr = 0;
sigset_t sigign, sigcatch;
char state;
@@ -383,7 +400,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
@@ -397,8 +414,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = cputime_zero;
- cgtime = gtime = cputime_zero;
+ cutime = cstime = utime = stime = 0;
+ cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
struct signal_struct *sig = task->signal;
@@ -410,7 +427,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
tty_nr = new_encode_dev(tty_devnum(sig->tty));
}
- num_threads = atomic_read(&sig->count);
+ num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
cmin_flt = sig->cmin_flt;
@@ -418,7 +435,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
cutime = sig->cutime;
cstime = sig->cstime;
cgtime = sig->cgtime;
- rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+ rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
/* add up live thread stats at the group level */
if (whole) {
@@ -426,14 +443,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime = cputime_add(gtime, t->gtime);
- t = next_thread(t);
- } while (t != task);
+ gtime += task_gtime(t);
+ } while_each_thread(task, t);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_times(task, &utime, &stime);
- gtime = cputime_add(gtime, sig->gtime);
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ gtime += sig->gtime;
}
sid = task_session_nr_ns(task, ns);
@@ -448,8 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_times(task, &utime, &stime);
- gtime = task->gtime;
+ task_cputime_adjusted(task, &utime, &stime);
+ gtime = task_gtime(task);
}
/* scale priority and nice values from timeslices to -20..20 */
@@ -465,56 +481,70 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
- seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
- pid_nr_ns(pid, ns),
- tcomm,
- state,
- ppid,
- pgid,
- sid,
- tty_nr,
- tty_pgrp,
- task->flags,
- min_flt,
- cmin_flt,
- maj_flt,
- cmaj_flt,
- cputime_to_clock_t(utime),
- cputime_to_clock_t(stime),
- cputime_to_clock_t(cutime),
- cputime_to_clock_t(cstime),
- priority,
- nice,
- num_threads,
- start_time,
- vsize,
- mm ? get_mm_rss(mm) : 0,
- rsslim,
- mm ? mm->start_code : 0,
- mm ? mm->end_code : 0,
- (permitted && mm) ? task->stack_start : 0,
- esp,
- eip,
- /* The signal information here is obsolete.
- * It must be decimal for Linux 2.0 compatibility.
- * Use /proc/#/status for real-time signals.
- */
- task->pending.signal.sig[0] & 0x7fffffffUL,
- task->blocked.sig[0] & 0x7fffffffUL,
- sigign .sig[0] & 0x7fffffffUL,
- sigcatch .sig[0] & 0x7fffffffUL,
- wchan,
- 0UL,
- 0UL,
- task->exit_signal,
- task_cpu(task),
- task->rt_priority,
- task->policy,
- (unsigned long long)delayacct_blkio_ticks(task),
- cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+ seq_put_decimal_ll(m, ' ', ppid);
+ seq_put_decimal_ll(m, ' ', pgid);
+ seq_put_decimal_ll(m, ' ', sid);
+ seq_put_decimal_ll(m, ' ', tty_nr);
+ seq_put_decimal_ll(m, ' ', tty_pgrp);
+ seq_put_decimal_ull(m, ' ', task->flags);
+ seq_put_decimal_ull(m, ' ', min_flt);
+ seq_put_decimal_ull(m, ' ', cmin_flt);
+ seq_put_decimal_ull(m, ' ', maj_flt);
+ seq_put_decimal_ull(m, ' ', cmaj_flt);
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
+ seq_put_decimal_ll(m, ' ', priority);
+ seq_put_decimal_ll(m, ' ', nice);
+ seq_put_decimal_ll(m, ' ', num_threads);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', start_time);
+ seq_put_decimal_ull(m, ' ', vsize);
+ seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, ' ', rsslim);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, ' ', esp);
+ seq_put_decimal_ull(m, ' ', eip);
+ /* The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', wchan);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ll(m, ' ', task->exit_signal);
+ seq_put_decimal_ll(m, ' ', task_cpu(task));
+ seq_put_decimal_ull(m, ' ', task->rt_priority);
+ seq_put_decimal_ull(m, ' ', task->policy);
+ seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+
+ if (mm && permitted) {
+ seq_put_decimal_ull(m, ' ', mm->start_data);
+ seq_put_decimal_ull(m, ' ', mm->end_data);
+ seq_put_decimal_ull(m, ' ', mm->start_brk);
+ seq_put_decimal_ull(m, ' ', mm->arg_start);
+ seq_put_decimal_ull(m, ' ', mm->arg_end);
+ seq_put_decimal_ull(m, ' ', mm->env_start);
+ seq_put_decimal_ull(m, ' ', mm->env_end);
+ } else
+ seq_printf(m, " 0 0 0 0 0 0 0");
+
+ if (permitted)
+ seq_put_decimal_ll(m, ' ', task->exit_code);
+ else
+ seq_put_decimal_ll(m, ' ', 0);
+
+ seq_putc(m, '\n');
if (mm)
mmput(mm);
return 0;
@@ -535,15 +565,150 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
+ unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
}
- seq_printf(m, "%d %d %d %d %d %d %d\n",
- size, resident, shared, text, lib, data, 0);
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, 0, size);
+ seq_put_decimal_ull(m, ' ', resident);
+ seq_put_decimal_ull(m, ' ', shared);
+ seq_put_decimal_ull(m, ' ', text);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', data);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static struct pid *
+get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+{
+ struct task_struct *start, *task;
+ struct pid *pid = NULL;
+
+ read_lock(&tasklist_lock);
+
+ start = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!start)
+ goto out;
+
+ /*
+ * Lets try to continue searching first, this gives
+ * us significant speedup on children-rich processes.
+ */
+ if (pid_prev) {
+ task = pid_task(pid_prev, PIDTYPE_PID);
+ if (task && task->real_parent == start &&
+ !(list_empty(&task->sibling))) {
+ if (list_is_last(&task->sibling, &start->children))
+ goto out;
+ task = list_first_entry(&task->sibling,
+ struct task_struct, sibling);
+ pid = get_pid(task_pid(task));
+ goto out;
+ }
+ }
+
+ /*
+ * Slow search case.
+ *
+ * We might miss some children here if children
+ * are exited while we were not holding the lock,
+ * but it was never promised to be accurate that
+ * much.
+ *
+ * "Just suppose that the parent sleeps, but N children
+ * exit after we printed their tids. Now the slow paths
+ * skips N extra children, we miss N tasks." (c)
+ *
+ * So one need to stop or freeze the leader and all
+ * its children to get a precise result.
+ */
+ list_for_each_entry(task, &start->children, sibling) {
+ if (pos-- == 0) {
+ pid = get_pid(task_pid(task));
+ break;
+ }
+ }
+
+out:
+ read_unlock(&tasklist_lock);
+ return pid;
+}
+
+static int children_seq_show(struct seq_file *seq, void *v)
+{
+ struct inode *inode = seq->private;
+ pid_t pid;
+ pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
+ return seq_printf(seq, "%d ", pid);
+}
+
+static void *children_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return get_children_pid(seq->private, NULL, *pos);
+}
+
+static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct pid *pid;
+
+ pid = get_children_pid(seq->private, v, *pos + 1);
+ put_pid(v);
+
+ ++*pos;
+ return pid;
+}
+
+static void children_seq_stop(struct seq_file *seq, void *v)
+{
+ put_pid(v);
+}
+
+static const struct seq_operations children_seq_ops = {
+ .start = children_seq_start,
+ .next = children_seq_next,
+ .stop = children_seq_stop,
+ .show = children_seq_show,
+};
+
+static int children_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &children_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = inode;
+
+ return ret;
+}
+
+int children_seq_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
return 0;
}
+
+const struct file_operations proc_tid_children_operations = {
+ .open = children_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = children_seq_release,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18d5cc62d8e..2d696b0c93b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
@@ -72,6 +73,7 @@
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/printk.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
@@ -80,8 +82,17 @@
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
+#include <linux/slab.h>
+#include <linux/flex_array.h>
+#include <linux/posix-timers.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
+#include <trace/events/oom.h>
#include "internal.h"
+#include "fd.h"
/* NOTE:
* Implementing inode permission operations in /proc is almost
@@ -96,7 +107,7 @@
struct pid_entry {
char *name;
int len;
- mode_t mode;
+ umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
union proc_op op;
@@ -147,151 +158,58 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
return count;
}
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
{
- struct fs_struct *fs;
int result = -ENOENT;
task_lock(task);
- fs = task->fs;
- if (fs) {
- read_lock(&fs->lock);
- *path = root ? fs->root : fs->pwd;
- path_get(path);
- read_unlock(&fs->lock);
+ if (task->fs) {
+ get_fs_root(task->fs, root);
result = 0;
}
task_unlock(task);
return result;
}
-static int get_nr_threads(struct task_struct *tsk)
-{
- unsigned long flags;
- int count = 0;
-
- if (lock_task_sighand(tsk, &flags)) {
- count = atomic_read(&tsk->signal->count);
- unlock_task_sighand(tsk, &flags);
- }
- return count;
-}
-
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
- result = get_fs_path(task, path, 0);
+ task_lock(task);
+ if (task->fs) {
+ get_fs_pwd(task->fs, path);
+ result = 0;
+ }
+ task_unlock(task);
put_task_struct(task);
}
return result;
}
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
- result = get_fs_path(task, path, 1);
+ result = get_task_root(task, path);
put_task_struct(task);
}
return result;
}
-/*
- * Return zero if current may access user memory in @task, -error if not.
- */
-static int check_mem_permission(struct task_struct *task)
+static int proc_pid_cmdline(struct task_struct *task, char *buffer)
{
- /*
- * A task can always look at itself, in case it chooses
- * to use system calls instead of load instructions.
- */
- if (task == current)
- return 0;
-
- /*
- * If current is actively ptrace'ing, and would also be
- * permitted to freshly attach with ptrace now, permit it.
- */
- if (task_is_stopped_or_traced(task)) {
- int match;
- rcu_read_lock();
- match = (tracehook_tracer_task(task) == current);
- rcu_read_unlock();
- if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
- return 0;
- }
-
- /*
- * Noone else is allowed.
- */
- return -EPERM;
-}
-
-struct mm_struct *mm_for_maps(struct task_struct *task)
-{
- struct mm_struct *mm;
-
- if (mutex_lock_killable(&task->cred_guard_mutex))
- return NULL;
-
- mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, PTRACE_MODE_READ)) {
- mmput(mm);
- mm = NULL;
- }
- mutex_unlock(&task->cred_guard_mutex);
-
- return mm;
-}
-
-static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-{
- int res = 0;
- unsigned int len;
- struct mm_struct *mm = get_task_mm(task);
- if (!mm)
- goto out;
- if (!mm->arg_end)
- goto out_mm; /* Shh! No looking before we're done */
-
- len = mm->arg_end - mm->arg_start;
-
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
-
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
-
- // If the nul at the end of args has been overwritten, then
- // assume application is using setproctitle(3).
- if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
- len = strnlen(buffer, res);
- if (len < res) {
- res = len;
- } else {
- len = mm->env_end - mm->env_start;
- if (len > PAGE_SIZE - res)
- len = PAGE_SIZE - res;
- res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
- res = strnlen(buffer, res);
- }
- }
-out_mm:
- mmput(mm);
-out:
- return res;
+ return get_cmdline(task, buffer, PAGE_SIZE);
}
static int proc_pid_auxv(struct task_struct *task, char *buffer)
{
- int res = 0;
- struct mm_struct *mm = get_task_mm(task);
- if (mm) {
+ struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+ int res = PTR_ERR(mm);
+ if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
nwords += 2;
@@ -328,6 +246,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_KALLSYMS */
+static int lock_trace(struct task_struct *task)
+{
+ int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (err)
+ return err;
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ return -EPERM;
+ }
+ return 0;
+}
+
+static void unlock_trace(struct task_struct *task)
+{
+ mutex_unlock(&task->signal->cred_guard_mutex);
+}
+
#ifdef CONFIG_STACKTRACE
#define MAX_STACK_TRACE_DEPTH 64
@@ -337,6 +272,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
{
struct stack_trace trace;
unsigned long *entries;
+ int err;
int i;
entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -347,15 +283,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
trace.max_entries = MAX_STACK_TRACE_DEPTH;
trace.entries = entries;
trace.skip = 0;
- save_stack_trace_tsk(task, &trace);
- for (i = 0; i < trace.nr_entries; i++) {
- seq_printf(m, "[<%p>] %pS\n",
- (void *)entries[i], (void *)entries[i]);
+ err = lock_trace(task);
+ if (!err) {
+ save_stack_trace_tsk(task, &trace);
+
+ for (i = 0; i < trace.nr_entries; i++) {
+ seq_printf(m, "[<%pK>] %pS\n",
+ (void *)entries[i], (void *)entries[i]);
+ }
+ unlock_trace(task);
}
kfree(entries);
- return 0;
+ return err;
}
#endif
@@ -383,26 +324,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
return -ESRCH;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < 32; i++) {
- if (task->latency_record[i].backtrace[0]) {
+ struct latency_record *lr = &task->latency_record[i];
+ if (lr->backtrace[0]) {
int q;
- seq_printf(m, "%i %li %li ",
- task->latency_record[i].count,
- task->latency_record[i].time,
- task->latency_record[i].max);
+ seq_printf(m, "%i %li %li",
+ lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
- char sym[KSYM_SYMBOL_LEN];
- char *c;
- if (!task->latency_record[i].backtrace[q])
+ unsigned long bt = lr->backtrace[q];
+ if (!bt)
break;
- if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+ if (bt == ULONG_MAX)
break;
- sprint_symbol(sym, task->latency_record[i].backtrace[q]);
- c = strchr(sym, '+');
- if (c)
- *c = 0;
- seq_printf(m, "%s ", sym);
+ seq_printf(m, " %ps", (void *)bt);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
}
@@ -418,7 +353,7 @@ static int lstats_open(struct inode *inode, struct file *file)
static ssize_t lstats_write(struct file *file, const char __user *buf,
size_t count, loff_t *offs)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -438,16 +373,46 @@ static const struct file_operations proc_lstats_operations = {
#endif
-/* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
+#ifdef CONFIG_CGROUPS
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cgroup_show, pid);
+}
+
+static const struct file_operations proc_cgroup_operations = {
+ .open = cgroup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+#ifdef CONFIG_PROC_PID_CPUSET
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cpuset_show, pid);
+}
+
+static const struct file_operations proc_cpuset_operations = {
+ .open = cpuset_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static int proc_oom_score(struct task_struct *task, char *buffer)
{
- unsigned long points;
- struct timespec uptime;
+ unsigned long totalpages = totalram_pages + total_swap_pages;
+ unsigned long points = 0;
- do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task->group_leader, uptime.tv_sec);
+ if (pid_alive(task))
+ points = oom_badness(task, NULL, NULL, totalpages) *
+ 1000 / totalpages;
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -526,18 +491,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
{
long nr;
unsigned long args[6], sp, pc;
+ int res = lock_trace(task);
+ if (res)
+ return res;
if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
- return sprintf(buffer, "running\n");
-
- if (nr < 0)
- return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
-
- return sprintf(buffer,
+ res = sprintf(buffer, "running\n");
+ else if (nr < 0)
+ res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+ else
+ res = sprintf(buffer,
"%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
nr,
args[0], args[1], args[2], args[3], args[4], args[5],
sp, pc);
+ unlock_trace(task);
+ return res;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
@@ -562,7 +531,7 @@ static int proc_fd_access_allowed(struct inode *inode)
return allowed;
}
-static int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
struct inode *inode = dentry->d_inode;
@@ -571,133 +540,62 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
return -EPERM;
error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
- return error;
-}
-
-static const struct inode_operations proc_def_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static int mounts_open_common(struct inode *inode, struct file *file,
- const struct seq_operations *op)
-{
- struct task_struct *task = get_proc_task(inode);
- struct nsproxy *nsp;
- struct mnt_namespace *ns = NULL;
- struct path root;
- struct proc_mounts *p;
- int ret = -EINVAL;
-
- if (task) {
- rcu_read_lock();
- nsp = task_nsproxy(task);
- if (nsp) {
- ns = nsp->mnt_ns;
- if (ns)
- get_mnt_ns(ns);
- }
- rcu_read_unlock();
- if (ns && get_fs_path(task, &root, 1) == 0)
- ret = 0;
- put_task_struct(task);
- }
-
- if (!ns)
- goto err;
- if (ret)
- goto err_put_ns;
-
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (!p)
- goto err_put_path;
-
- file->private_data = &p->m;
- ret = seq_open(file, op);
- if (ret)
- goto err_free;
-
- p->m.private = p;
- p->ns = ns;
- p->root = root;
- p->event = ns->event;
+ if (error)
+ return error;
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
return 0;
-
- err_free:
- kfree(p);
- err_put_path:
- path_put(&root);
- err_put_ns:
- put_mnt_ns(ns);
- err:
- return ret;
}
-static int mounts_release(struct inode *inode, struct file *file)
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+ struct task_struct *task,
+ int hide_pid_min)
{
- struct proc_mounts *p = file->private_data;
- path_put(&p->root);
- put_mnt_ns(p->ns);
- return seq_release(inode, file);
+ if (pid->hide_pid < hide_pid_min)
+ return true;
+ if (in_group_p(pid->pid_gid))
+ return true;
+ return ptrace_may_access(task, PTRACE_MODE_READ);
}
-static unsigned mounts_poll(struct file *file, poll_table *wait)
-{
- struct proc_mounts *p = file->private_data;
- struct mnt_namespace *ns = p->ns;
- unsigned res = POLLIN | POLLRDNORM;
-
- poll_wait(file, &ns->poll, wait);
-
- spin_lock(&vfsmount_lock);
- if (p->event != ns->event) {
- p->event = ns->event;
- res |= POLLERR | POLLPRI;
- }
- spin_unlock(&vfsmount_lock);
- return res;
-}
-
-static int mounts_open(struct inode *inode, struct file *file)
+static int proc_pid_permission(struct inode *inode, int mask)
{
- return mounts_open_common(inode, file, &mounts_op);
-}
+ struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct task_struct *task;
+ bool has_perms;
-static const struct file_operations proc_mounts_operations = {
- .open = mounts_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ has_perms = has_pid_permissions(pid, task, 1);
+ put_task_struct(task);
-static int mountinfo_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountinfo_op);
+ if (!has_perms) {
+ if (pid->hide_pid == 2) {
+ /*
+ * Let's make getdents(), stat(), and open()
+ * consistent with each other. If a process
+ * may not stat() a file, it shouldn't be seen
+ * in procfs at all.
+ */
+ return -ENOENT;
+ }
+
+ return -EPERM;
+ }
+ return generic_permission(inode, mask);
}
-static const struct file_operations proc_mountinfo_operations = {
- .open = mountinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
-static int mountstats_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountstats_op);
-}
-static const struct file_operations proc_mountstats_operations = {
- .open = mountstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
+static const struct inode_operations proc_def_inode_operations = {
+ .setattr = proc_setattr,
};
#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
@@ -705,7 +603,7 @@ static const struct file_operations proc_mountstats_operations = {
static ssize_t proc_info_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
unsigned long page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -734,6 +632,7 @@ out_no_task:
static const struct file_operations proc_info_file_operations = {
.read = proc_info_read,
+ .llseek = generic_file_llseek,
};
static int proc_single_show(struct seq_file *m, void *v)
@@ -758,14 +657,7 @@ static int proc_single_show(struct seq_file *m, void *v)
static int proc_single_open(struct inode *inode, struct file *filp)
{
- int ret;
- ret = single_open(filp, proc_single_show, NULL);
- if (!ret) {
- struct seq_file *m = filp->private_data;
-
- m->private = inode;
- }
- return ret;
+ return single_open(filp, proc_single_show, inode);
}
static const struct file_operations proc_single_file_operations = {
@@ -775,130 +667,105 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-static int mem_open(struct inode* inode, struct file* file)
-{
- file->private_data = (void*)((long)current->self_exec_id);
- return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char *page;
- unsigned long src = *ppos;
- int ret = -ESRCH;
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
if (!task)
- goto out_no_task;
+ return -ESRCH;
- if (check_mem_permission(task))
- goto out;
+ mm = mm_access(task, mode);
+ put_task_struct(task);
- ret = -ENOMEM;
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page)
- goto out;
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
- ret = 0;
-
- mm = get_task_mm(task);
- if (!mm)
- goto out_free;
+ if (mm) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
+ }
- ret = -EIO;
-
- if (file->private_data != (void*)((long)current->self_exec_id))
- goto out_put;
+ file->private_data = mm;
- ret = 0;
-
- while (count > 0) {
- int this_len, retval;
+ return 0;
+}
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- retval = access_process_vm(task, src, page, this_len, 0);
- if (!retval || check_mem_permission(task)) {
- if (!ret)
- ret = -EIO;
- break;
- }
+static int mem_open(struct inode *inode, struct file *file)
+{
+ int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
- if (copy_to_user(buf, page, retval)) {
- ret = -EFAULT;
- break;
- }
-
- ret += retval;
- src += retval;
- buf += retval;
- count -= retval;
- }
- *ppos = src;
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
-out_put:
- mmput(mm);
-out_free:
- free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
return ret;
}
-#define mem_write NULL
-
-#ifndef mem_write
-/* This is a security hazard */
-static ssize_t mem_write(struct file * file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- int copied;
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
char *page;
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- unsigned long dst = *ppos;
-
- copied = -ESRCH;
- if (!task)
- goto out_no_task;
- if (check_mem_permission(task))
- goto out;
+ if (!mm)
+ return 0;
- copied = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out;
+ return -ENOMEM;
copied = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
+
while (count > 0) {
- int this_len, retval;
+ int this_len = min_t(int, count, PAGE_SIZE);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- if (copy_from_user(page, buf, this_len)) {
+ if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
- retval = access_process_vm(task, dst, page, this_len, 1);
- if (!retval) {
+
+ this_len = access_remote_vm(mm, addr, page, this_len, write);
+ if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
- copied += retval;
- buf += retval;
- dst += retval;
- count -= retval;
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
}
- *ppos = dst;
+ *ppos = addr;
+
+ mmput(mm);
+free:
free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
return copied;
}
-#endif
+
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
@@ -916,51 +783,58 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
+ .release = mem_release,
};
+static int environ_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ);
+}
+
static ssize_t environ_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
char *page;
unsigned long src = *ppos;
- int ret = -ESRCH;
- struct mm_struct *mm;
-
- if (!task)
- goto out_no_task;
+ int ret = 0;
+ struct mm_struct *mm = file->private_data;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out;
+ if (!mm)
+ return 0;
- ret = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out;
+ return -ENOMEM;
ret = 0;
-
- mm = get_task_mm(task);
- if (!mm)
- goto out_free;
-
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
while (count > 0) {
- int this_len, retval, max_len;
+ size_t this_len, max_len;
+ int retval;
- this_len = mm->env_end - (mm->env_start + src);
-
- if (this_len <= 0)
+ if (src >= (mm->env_end - mm->env_start))
break;
- max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- this_len = (this_len > max_len) ? max_len : this_len;
+ this_len = mm->env_end - (mm->env_start + src);
+
+ max_len = min_t(size_t, PAGE_SIZE, count);
+ this_len = min(max_len, this_len);
- retval = access_process_vm(task, (mm->env_start + src),
+ retval = access_remote_vm(mm, (mm->env_start + src),
page, this_len, 0);
if (retval <= 0) {
@@ -979,91 +853,214 @@ static ssize_t environ_read(struct file *file, char __user *buf,
count -= retval;
}
*ppos = src;
-
mmput(mm);
-out_free:
+
+free:
free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
return ret;
}
static const struct file_operations proc_environ_operations = {
+ .open = environ_open,
.read = environ_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
};
-static ssize_t oom_adjust_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
+ int oom_adj = OOM_ADJUST_MIN;
size_t len;
- int oom_adjust = OOM_DISABLE;
unsigned long flags;
if (!task)
return -ESRCH;
-
if (lock_task_sighand(task, &flags)) {
- oom_adjust = task->signal->oom_adj;
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ else
+ oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ OOM_SCORE_ADJ_MAX;
unlock_task_sighand(task, &flags);
}
+ put_task_struct(task);
+ len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ int oom_adj;
+ unsigned long flags;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+ if (err)
+ goto out;
+ if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+ oom_adj != OOM_DISABLE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ task = get_proc_task(file_inode(file));
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
+
+ if (!lock_task_sighand(task, &flags)) {
+ err = -ESRCH;
+ goto err_task_lock;
+ }
+ /*
+ * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+ * value is always attainable.
+ */
+ if (oom_adj == OOM_ADJUST_MAX)
+ oom_adj = OOM_SCORE_ADJ_MAX;
+ else
+ oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_sighand;
+ }
+
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+
+ task->signal->oom_score_adj = oom_adj;
+ trace_oom_score_adj_update(task);
+err_sighand:
+ unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
put_task_struct(task);
+out:
+ return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+ .read = oom_adj_read,
+ .write = oom_adj_write,
+ .llseek = generic_file_llseek,
+};
- len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ char buffer[PROC_NUMBUF];
+ short oom_score_adj = OOM_SCORE_ADJ_MIN;
+ unsigned long flags;
+ size_t len;
+ if (!task)
+ return -ESRCH;
+ if (lock_task_sighand(task, &flags)) {
+ oom_score_adj = task->signal->oom_score_adj;
+ unlock_task_sighand(task, &flags);
+ }
+ put_task_struct(task);
+ len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
-static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
{
struct task_struct *task;
char buffer[PROC_NUMBUF];
- long oom_adjust;
unsigned long flags;
+ int oom_score_adj;
int err;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- return -EFAULT;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
- err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+ err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
if (err)
- return -EINVAL;
- if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
- oom_adjust != OOM_DISABLE)
- return -EINVAL;
+ goto out;
+ if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+ oom_score_adj > OOM_SCORE_ADJ_MAX) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ task = get_proc_task(file_inode(file));
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
- task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task)
- return -ESRCH;
if (!lock_task_sighand(task, &flags)) {
- put_task_struct(task);
- return -ESRCH;
+ err = -ESRCH;
+ goto err_task_lock;
}
- if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
- unlock_task_sighand(task, &flags);
- put_task_struct(task);
- return -EACCES;
+ if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_sighand;
}
- task->signal->oom_adj = oom_adjust;
+ task->signal->oom_score_adj = (short)oom_score_adj;
+ if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = (short)oom_score_adj;
+ trace_oom_score_adj_update(task);
+err_sighand:
unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
put_task_struct(task);
-
- return count;
+out:
+ return err < 0 ? err : count;
}
-static const struct file_operations proc_oom_adjust_operations = {
- .read = oom_adjust_read,
- .write = oom_adjust_write,
+static const struct file_operations proc_oom_score_adj_operations = {
+ .read = oom_score_adj_read,
+ .write = oom_score_adj_write,
+ .llseek = default_llseek,
};
#ifdef CONFIG_AUDITSYSCALL
@@ -1071,7 +1068,7 @@ static const struct file_operations proc_oom_adjust_operations = {
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
struct task_struct *task = get_proc_task(inode);
ssize_t length;
char tmpbuf[TMPBUFLEN];
@@ -1079,7 +1076,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
- audit_get_loginuid(task));
+ from_kuid(file->f_cred->user_ns,
+ audit_get_loginuid(task)));
put_task_struct(task);
return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}
@@ -1087,16 +1085,18 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *page, *tmp;
ssize_t length;
uid_t loginuid;
+ kuid_t kloginuid;
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-
- if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
+ rcu_read_lock();
+ if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
+ rcu_read_unlock();
return -EPERM;
+ }
+ rcu_read_unlock();
if (count >= PAGE_SIZE)
count = PAGE_SIZE - 1;
@@ -1119,7 +1119,19 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(current, loginuid);
+
+ /* is userspace tring to explicitly UNSET the loginuid? */
+ if (loginuid == AUDIT_UID_UNSET) {
+ kloginuid = INVALID_UID;
+ } else {
+ kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+ if (!uid_valid(kloginuid)) {
+ length = -EINVAL;
+ goto out_free_page;
+ }
+ }
+
+ length = audit_set_loginuid(kloginuid);
if (likely(length == 0))
length = count;
@@ -1131,12 +1143,13 @@ out_free_page:
static const struct file_operations proc_loginuid_operations = {
.read = proc_loginuid_read,
.write = proc_loginuid_write,
+ .llseek = generic_file_llseek,
};
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
struct task_struct *task = get_proc_task(inode);
ssize_t length;
char tmpbuf[TMPBUFLEN];
@@ -1151,6 +1164,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
static const struct file_operations proc_sessionid_operations = {
.read = proc_sessionid_read,
+ .llseek = generic_file_llseek,
};
#endif
@@ -1158,7 +1172,7 @@ static const struct file_operations proc_sessionid_operations = {
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
size_t len;
int make_it_fail;
@@ -1190,7 +1204,10 @@ static ssize_t proc_fault_inject_write(struct file * file,
make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
- task = get_proc_task(file->f_dentry->d_inode);
+ if (make_it_fail < 0 || make_it_fail > 1)
+ return -EINVAL;
+
+ task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
task->make_it_fail = make_it_fail;
@@ -1202,6 +1219,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
static const struct file_operations proc_fault_inject_operations = {
.read = proc_fault_inject_read,
.write = proc_fault_inject_write,
+ .llseek = generic_file_llseek,
};
#endif
@@ -1229,7 +1247,7 @@ static ssize_t
sched_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct task_struct *p;
p = get_proc_task(inode);
@@ -1244,9 +1262,76 @@ sched_write(struct file *file, const char __user *buf,
static int sched_open(struct inode *inode, struct file *filp)
{
+ return single_open(filp, sched_show, inode);
+}
+
+static const struct file_operations proc_pid_sched_operations = {
+ .open = sched_open,
+ .read = seq_read,
+ .write = sched_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_autogroup_show_task(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ char buffer[PROC_NUMBUF];
+ int nice;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ err = kstrtoint(strstrip(buffer), 0, &nice);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ err = proc_sched_autogroup_set_nice(p, nice);
+ if (err)
+ count = err;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
int ret;
- ret = single_open(filp, sched_show, NULL);
+ ret = single_open(filp, sched_autogroup_show, NULL);
if (!ret) {
struct seq_file *m = filp->private_data;
@@ -1255,27 +1340,26 @@ static int sched_open(struct inode *inode, struct file *filp)
return ret;
}
-static const struct file_operations proc_pid_sched_operations = {
- .open = sched_open,
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+ .open = sched_autogroup_open,
.read = seq_read,
- .write = sched_write,
+ .write = sched_autogroup_write,
.llseek = seq_lseek,
.release = single_release,
};
-#endif
+#endif /* CONFIG_SCHED_AUTOGROUP */
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct task_struct *p;
char buffer[TASK_COMM_LEN];
+ const size_t maxlen = sizeof(buffer) - 1;
memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
+ if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
p = get_proc_task(inode);
@@ -1312,15 +1396,7 @@ static int comm_show(struct seq_file *m, void *v)
static int comm_open(struct inode *inode, struct file *filp)
{
- int ret;
-
- ret = single_open(filp, comm_show, NULL);
- if (!ret) {
- struct seq_file *m = filp->private_data;
-
- m->private = inode;
- }
- return ret;
+ return single_open(filp, comm_show, inode);
}
static const struct file_operations proc_pid_set_comm_operations = {
@@ -1331,64 +1407,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
-{
- if (new_exe_file)
- get_file(new_exe_file);
- if (mm->exe_file)
- fput(mm->exe_file);
- mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
-}
-
-struct file *get_mm_exe_file(struct mm_struct *mm)
-{
- struct file *exe_file;
-
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
- down_read(&mm->mmap_sem);
- exe_file = mm->exe_file;
- if (exe_file)
- get_file(exe_file);
- up_read(&mm->mmap_sem);
- return exe_file;
-}
-
-void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
- /* It's safe to write the exe_file pointer without exe_file_lock because
- * this is called during fork when the task is not yet in /proc */
- newmm->exe_file = get_mm_exe_file(oldmm);
-}
-
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(inode);
+ task = get_proc_task(dentry->d_inode);
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1409,17 +1434,19 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path)
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
+ struct path path;
int error = -EACCES;
- /* We don't need a base pointer in the /proc filesystem */
- path_put(&nd->path);
-
/* Are we allowed to snoop on the tasks file descriptors? */
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
- nd->last_type = LAST_BIND;
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+ if (error)
+ goto out;
+
+ nd_jump_link(nd, &path);
+ return NULL;
out:
return ERR_PTR(error);
}
@@ -1458,7 +1485,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
if (error)
goto out;
@@ -1468,7 +1495,7 @@ out:
return error;
}
-static const struct inode_operations proc_pid_link_inode_operations = {
+const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
@@ -1477,23 +1504,7 @@ static const struct inode_operations proc_pid_link_inode_operations = {
/* building an inode */
-static int task_dumpable(struct task_struct *task)
-{
- int dumpable = 0;
- struct mm_struct *mm;
-
- task_lock(task);
- mm = task->mm;
- if (mm)
- dumpable = get_dumpable(mm);
- task_unlock(task);
- if(dumpable == 1)
- return 1;
- return 0;
-}
-
-
-static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
{
struct inode * inode;
struct proc_inode *ei;
@@ -1507,6 +1518,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
/* Common stuff */
ei = PROC_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &proc_def_inode_operations;
@@ -1534,19 +1546,28 @@ out_unlock:
return NULL;
}
-static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
struct task_struct *task;
const struct cred *cred;
+ struct pid_namespace *pid = dentry->d_sb->s_fs_info;
generic_fillattr(inode, stat);
rcu_read_lock();
- stat->uid = 0;
- stat->gid = 0;
+ stat->uid = GLOBAL_ROOT_UID;
+ stat->gid = GLOBAL_ROOT_GID;
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
+ if (!has_pid_permissions(pid, task, 2)) {
+ rcu_read_unlock();
+ /*
+ * This doesn't prevent learning whether PID exists,
+ * it only makes getattr() consistent with readdir().
+ */
+ return -ENOENT;
+ }
if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
task_dumpable(task)) {
cred = __task_cred(task);
@@ -1575,12 +1596,18 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
* made this apply to all per process world readable and executable
* directories.
*/
-static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
+int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
+ struct inode *inode;
+ struct task_struct *task;
const struct cred *cred;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+
if (task) {
if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
task_dumpable(task)) {
@@ -1590,8 +1617,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
inode->i_mode &= ~(S_ISUID | S_ISGID);
security_task_to_inode(task, inode);
@@ -1602,16 +1629,21 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
return 0;
}
-static int pid_delete_dentry(struct dentry * dentry)
+static inline bool proc_inode_is_dead(struct inode *inode)
+{
+ return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+}
+
+int pid_delete_dentry(const struct dentry *dentry)
{
/* Is the task we represent dead?
* If so, then don't put the dentry on the lru list,
* kill it immediately.
*/
- return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+ return proc_inode_is_dead(dentry->d_inode);
}
-static const struct dentry_operations pid_dentry_operations =
+const struct dentry_operations pid_dentry_operations =
{
.d_revalidate = pid_revalidate,
.d_delete = pid_delete_dentry,
@@ -1619,9 +1651,6 @@ static const struct dentry_operations pid_dentry_operations =
/* Lookups */
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
- struct task_struct *, const void *);
-
/*
* Fill a directory entry.
*
@@ -1634,409 +1663,455 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
-static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- char *name, int len,
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
+ const char *name, int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
+ struct qstr qname = QSTR_INIT(name, len);
struct inode *inode;
- struct qstr qname;
- ino_t ino = 0;
- unsigned type = DT_UNKNOWN;
-
- qname.name = name;
- qname.len = len;
- qname.hash = full_name_hash(name, len);
+ unsigned type;
+ ino_t ino;
- child = d_lookup(dir, &qname);
+ child = d_hash_and_lookup(dir, &qname);
if (!child) {
- struct dentry *new;
- new = d_alloc(dir, &qname);
- if (new) {
- child = instantiate(dir->d_inode, new, task, ptr);
- if (child)
- dput(new);
- else
- child = new;
+ child = d_alloc(dir, &qname);
+ if (!child)
+ goto end_instantiate;
+ if (instantiate(dir->d_inode, child, task, ptr) < 0) {
+ dput(child);
+ goto end_instantiate;
}
}
- if (!child || IS_ERR(child) || !child->d_inode)
- goto end_instantiate;
inode = child->d_inode;
- if (inode) {
- ino = inode->i_ino;
- type = inode->i_mode >> 12;
- }
+ ino = inode->i_ino;
+ type = inode->i_mode >> 12;
dput(child);
+ return dir_emit(ctx, name, len, ino, type);
+
end_instantiate:
- if (!ino)
- ino = find_inode_number(dir, &qname);
- if (!ino)
- ino = 1;
- return filldir(dirent, name, len, filp->f_pos, ino, type);
+ return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-static unsigned name_to_int(struct dentry *dentry)
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+ unsigned long *start, unsigned long *end)
{
- const char *name = dentry->d_name.name;
- int len = dentry->d_name.len;
- unsigned n = 0;
+ if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+ return -EINVAL;
- if (len > 1 && *name == '0')
- goto out;
- while (len-- > 0) {
- unsigned c = *name++ - '0';
- if (c > 9)
- goto out;
- if (n >= (~0U-9)/10)
- goto out;
- n *= 10;
- n += c;
- }
- return n;
-out:
- return ~0U;
+ return 0;
}
-#define PROC_FDINFO_MAX 64
-
-static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct task_struct *task = get_proc_task(inode);
- struct files_struct *files = NULL;
- struct file *file;
- int fd = proc_fd(inode);
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int status = 0;
- if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
- if (files) {
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (file) {
- if (path) {
- *path = file->f_path;
- path_get(&file->f_path);
- }
- if (info)
- snprintf(info, PROC_FDINFO_MAX,
- "pos:\t%lli\n"
- "flags:\t0%o\n",
- (long long) file->f_pos,
- file->f_flags);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
- return 0;
- }
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ status = -EPERM;
+ goto out_notask;
}
- return -ENOENT;
-}
-static int proc_fd_link(struct inode *inode, struct path *path)
-{
- return proc_fd_info(inode, path, NULL);
-}
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
-static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- int fd = proc_fd(inode);
- struct files_struct *files;
- const struct cred *cred;
+ mm = mm_access(task, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(mm))
+ goto out;
- if (task) {
- files = get_files_struct(task);
- if (files) {
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ down_read(&mm->mmap_sem);
+ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+ up_read(&mm->mmap_sem);
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ if (task_dumpable(task)) {
rcu_read_lock();
- if (fcheck_files(files, fd)) {
- rcu_read_unlock();
- put_files_struct(files);
- if (task_dumpable(task)) {
- rcu_read_lock();
- cred = __task_cred(task);
- inode->i_uid = cred->euid;
- inode->i_gid = cred->egid;
- rcu_read_unlock();
- } else {
- inode->i_uid = 0;
- inode->i_gid = 0;
- }
- inode->i_mode &= ~(S_ISUID | S_ISGID);
- security_task_to_inode(task, inode);
- put_task_struct(task);
- return 1;
- }
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
rcu_read_unlock();
- put_files_struct(files);
+ } else {
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
- put_task_struct(task);
+ security_task_to_inode(task, inode);
+ status = 1;
}
- d_drop(dentry);
- return 0;
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ if (status <= 0)
+ d_drop(dentry);
+
+ return status;
}
-static const struct dentry_operations tid_fd_dentry_operations =
-{
- .d_revalidate = tid_fd_revalidate,
+static const struct dentry_operations tid_map_files_dentry_operations = {
+ .d_revalidate = map_files_d_revalidate,
.d_delete = pid_delete_dentry,
};
-static struct dentry *proc_fd_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
{
- unsigned fd = *(const unsigned *)ptr;
- struct file *file;
- struct files_struct *files;
- struct inode *inode;
- struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-ENOENT);
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int rc;
+
+ rc = -ENOENT;
+ task = get_proc_task(dentry->d_inode);
+ if (!task)
+ goto out;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ if (rc)
+ goto out_mmput;
+
+ rc = -ENOENT;
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (vma && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(path);
+ rc = 0;
+ }
+ up_read(&mm->mmap_sem);
+
+out_mmput:
+ mmput(mm);
+out:
+ return rc;
+}
+
+struct map_files_info {
+ fmode_t mode;
+ unsigned long len;
+ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static int
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ fmode_t mode = (fmode_t)(unsigned long)ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
inode = proc_pid_make_inode(dir->i_sb, task);
if (!inode)
- goto out;
- ei = PROC_I(inode);
- ei->fd = fd;
- files = get_files_struct(task);
- if (!files)
- goto out_iput;
- inode->i_mode = S_IFLNK;
+ return -ENOENT;
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (!file)
- goto out_unlock;
- if (file->f_mode & FMODE_READ)
- inode->i_mode |= S_IRUSR | S_IXUSR;
- if (file->f_mode & FMODE_WRITE)
- inode->i_mode |= S_IWUSR | S_IXUSR;
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ ei = PROC_I(inode);
+ ei->op.proc_get_link = proc_map_files_get_link;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
- ei->op.proc_get_link = proc_fd_link;
- dentry->d_op = &tid_fd_dentry_operations;
+ inode->i_mode = S_IFLNK;
+
+ if (mode & FMODE_READ)
+ inode->i_mode |= S_IRUSR;
+ if (mode & FMODE_WRITE)
+ inode->i_mode |= S_IWUSR;
+
+ d_set_d_op(dentry, &tid_map_files_dentry_operations);
d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, NULL))
- error = NULL;
- out:
- return error;
-out_unlock:
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-out_iput:
- iput(inode);
- goto out;
+ return 0;
}
-static struct dentry *proc_lookupfd_common(struct inode *dir,
- struct dentry *dentry,
- instantiate_t instantiate)
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
- struct task_struct *task = get_proc_task(dir);
- unsigned fd = name_to_int(dentry);
- struct dentry *result = ERR_PTR(-ENOENT);
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ int result;
+ struct mm_struct *mm;
+
+ result = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+ result = -ENOENT;
+ task = get_proc_task(dir);
if (!task)
- goto out_no_task;
- if (fd == ~0U)
goto out;
- result = instantiate(dir, dentry, task, &fd);
-out:
+ result = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ result = -ENOENT;
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_put_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ if (vma->vm_file)
+ result = proc_map_files_instantiate(dir, dentry, task,
+ (void *)(unsigned long)vma->vm_file->f_mode);
+
+out_no_vma:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out_put_task:
put_task_struct(task);
-out_no_task:
- return result;
+out:
+ return ERR_PTR(result);
}
-static int proc_readfd_common(struct file * filp, void * dirent,
- filldir_t filldir, instantiate_t instantiate)
+static const struct inode_operations proc_map_files_inode_operations = {
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *p = get_proc_task(inode);
- unsigned int fd, ino;
- int retval;
- struct files_struct * files;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
+ int ret;
- retval = -ENOENT;
- if (!p)
- goto out_no_task;
- retval = 0;
-
- fd = filp->f_pos;
- switch (fd) {
- case 0:
- if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- default:
- files = get_files_struct(p);
- if (!files)
- goto out;
- rcu_read_lock();
- for (fd = filp->f_pos-2;
- fd < files_fdtable(files)->max_fds;
- fd++, filp->f_pos++) {
- char name[PROC_NUMBUF];
- int len;
-
- if (!fcheck_files(files, fd))
- continue;
- rcu_read_unlock();
-
- len = snprintf(name, sizeof(name), "%d", fd);
- if (proc_fill_cache(filp, dirent, filldir,
- name, len, instantiate,
- p, &fd) < 0) {
- rcu_read_lock();
- break;
- }
- rcu_read_lock();
- }
- rcu_read_unlock();
- put_files_struct(files);
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -ENOENT;
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ ret = 0;
+ if (!dir_emit_dots(file, ctx))
+ goto out_put_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+ down_read(&mm->mmap_sem);
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
+
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > ctx->pos)
+ nr_files++;
}
-out:
- put_task_struct(p);
-out_no_task:
- return retval;
-}
-static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
-{
- return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
-}
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_put_task;
+ }
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
+
+ info.mode = vma->vm_file->f_mode;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
+ }
+ }
+ up_read(&mm->mmap_sem);
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
-{
- return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
-}
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ if (!proc_fill_cache(file, ctx,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task,
+ (void *)(unsigned long)p->mode))
+ break;
+ ctx->pos++;
+ }
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
-static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- char tmp[PROC_FDINFO_MAX];
- int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
- if (!err)
- err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
- return err;
+out_put_task:
+ put_task_struct(task);
+out:
+ return ret;
}
-static const struct file_operations proc_fdinfo_file_operations = {
- .open = nonseekable_open,
- .read = proc_fdinfo_read,
+static const struct file_operations proc_map_files_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_map_files_readdir,
+ .llseek = default_llseek,
};
-static const struct file_operations proc_fd_operations = {
- .read = generic_read_dir,
- .readdir = proc_readfd,
+struct timers_private {
+ struct pid *pid;
+ struct task_struct *task;
+ struct sighand_struct *sighand;
+ struct pid_namespace *ns;
+ unsigned long flags;
};
-/*
- * /proc/pid/fd needs a special permission handler so that a process can still
- * access /proc/self/fd after it has executed a setuid().
- */
-static int proc_fd_permission(struct inode *inode, int mask)
+static void *timers_start(struct seq_file *m, loff_t *pos)
{
- int rv;
+ struct timers_private *tp = m->private;
- rv = generic_permission(inode, mask, NULL);
- if (rv == 0)
- return 0;
- if (task_pid(current) == proc_pid(inode))
- rv = 0;
- return rv;
+ tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+ if (!tp->task)
+ return ERR_PTR(-ESRCH);
+
+ tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+ if (!tp->sighand)
+ return ERR_PTR(-ESRCH);
+
+ return seq_list_start(&tp->task->signal->posix_timers, *pos);
}
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fd_inode_operations = {
- .lookup = proc_lookupfd,
- .permission = proc_fd_permission,
- .setattr = proc_setattr,
-};
+static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct timers_private *tp = m->private;
+ return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+}
-static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
+static void timers_stop(struct seq_file *m, void *v)
{
- unsigned fd = *(unsigned *)ptr;
- struct inode *inode;
- struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-ENOENT);
+ struct timers_private *tp = m->private;
- inode = proc_pid_make_inode(dir->i_sb, task);
- if (!inode)
- goto out;
- ei = PROC_I(inode);
- ei->fd = fd;
- inode->i_mode = S_IFREG | S_IRUSR;
- inode->i_fop = &proc_fdinfo_file_operations;
- dentry->d_op = &tid_fd_dentry_operations;
- d_add(dentry, inode);
- /* Close the race of the process dying before we return the dentry */
- if (tid_fd_revalidate(dentry, NULL))
- error = NULL;
+ if (tp->sighand) {
+ unlock_task_sighand(tp->task, &tp->flags);
+ tp->sighand = NULL;
+ }
- out:
- return error;
+ if (tp->task) {
+ put_task_struct(tp->task);
+ tp->task = NULL;
+ }
}
-static struct dentry *proc_lookupfdinfo(struct inode *dir,
- struct dentry *dentry,
- struct nameidata *nd)
+static int show_timer(struct seq_file *m, void *v)
{
- return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
-}
+ struct k_itimer *timer;
+ struct timers_private *tp = m->private;
+ int notify;
+ static char *nstr[] = {
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
+ };
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
-{
- return proc_readfd_common(filp, dirent, filldir,
- proc_fdinfo_instantiate);
+ timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ notify = timer->it_sigev_notify;
+
+ seq_printf(m, "ID: %d\n", timer->it_id);
+ seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
+ timer->sigq->info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n",
+ nstr[notify & ~SIGEV_THREAD_ID],
+ (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ pid_nr_ns(timer->it_pid, tp->ns));
+ seq_printf(m, "ClockID: %d\n", timer->it_clock);
+
+ return 0;
}
-static const struct file_operations proc_fdinfo_operations = {
- .read = generic_read_dir,
- .readdir = proc_readfdinfo,
+static const struct seq_operations proc_timers_seq_ops = {
+ .start = timers_start,
+ .next = timers_next,
+ .stop = timers_stop,
+ .show = show_timer,
};
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fdinfo_inode_operations = {
- .lookup = proc_lookupfdinfo,
- .setattr = proc_setattr,
-};
+static int proc_timers_open(struct inode *inode, struct file *file)
+{
+ struct timers_private *tp;
+ tp = __seq_open_private(file, &proc_timers_seq_ops,
+ sizeof(struct timers_private));
+ if (!tp)
+ return -ENOMEM;
+
+ tp->pid = proc_pid(inode);
+ tp->ns = inode->i_sb->s_fs_info;
+ return 0;
+}
+
+static const struct file_operations proc_timers_operations = {
+ .open = proc_timers_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
-static struct dentry *proc_pident_instantiate(struct inode *dir,
+static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-ENOENT);
inode = proc_pid_make_inode(dir->i_sb, task);
if (!inode)
@@ -2045,19 +2120,19 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
ei = PROC_I(inode);
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 2; /* Use getattr to fix if necessary */
+ set_nlink(inode, 2); /* Use getattr to fix if necessary */
if (p->iop)
inode->i_op = p->iop;
if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
- dentry->d_op = &pid_dentry_operations;
+ d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, NULL))
- error = NULL;
+ if (pid_revalidate(dentry, 0))
+ return 0;
out:
- return error;
+ return -ENOENT;
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2065,11 +2140,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
const struct pid_entry *ents,
unsigned int nents)
{
- struct dentry *error;
+ int error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
- error = ERR_PTR(-ENOENT);
+ error = -ENOENT;
if (!task)
goto out_no_task;
@@ -2092,77 +2167,40 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
out:
put_task_struct(task);
out_no_task:
- return error;
-}
-
-static int proc_pident_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
- return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
- proc_pident_instantiate, task, p);
+ return ERR_PTR(error);
}
-static int proc_pident_readdir(struct file *filp,
- void *dirent, filldir_t filldir,
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
const struct pid_entry *ents, unsigned int nents)
{
- int i;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- const struct pid_entry *p, *last;
- ino_t ino;
- int ret;
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct pid_entry *p;
- ret = -ENOENT;
if (!task)
- goto out_no_task;
+ return -ENOENT;
- ret = 0;
- i = filp->f_pos;
- switch (i) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- i -= 2;
- if (i >= nents) {
- ret = 1;
- goto out;
- }
- p = ents + i;
- last = &ents[nents - 1];
- while (p <= last) {
- if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
- goto out;
- filp->f_pos++;
- p++;
- }
- }
+ if (!dir_emit_dots(file, ctx))
+ goto out;
- ret = 1;
+ if (ctx->pos >= nents + 2)
+ goto out;
+
+ for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+ if (!proc_fill_cache(file, ctx, p->name, p->len,
+ proc_pident_instantiate, task, p))
+ break;
+ ctx->pos++;
+ }
out:
put_task_struct(task);
-out_no_task:
- return ret;
+ return 0;
}
#ifdef CONFIG_SECURITY
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *p = NULL;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2183,7 +2221,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
- struct inode * inode = file->f_path.dentry->d_inode;
+ struct inode * inode = file_inode(file);
char *page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2209,14 +2247,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
goto out_free;
/* Guard against adverse ptrace interaction */
- length = mutex_lock_interruptible(&task->cred_guard_mutex);
+ length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
if (length < 0)
goto out_free;
length = security_setprocattr(task,
(char*)file->f_path.dentry->d_name.name,
(void*)page, count);
- mutex_unlock(&task->cred_guard_mutex);
+ mutex_unlock(&task->signal->cred_guard_mutex);
out_free:
free_page((unsigned long) page);
out:
@@ -2228,6 +2266,7 @@ out_no_task:
static const struct file_operations proc_pid_attr_operations = {
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
+ .llseek = generic_file_llseek,
};
static const struct pid_entry attr_dir_stuff[] = {
@@ -2239,20 +2278,20 @@ static const struct pid_entry attr_dir_stuff[] = {
REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
};
-static int proc_attr_dir_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+ return proc_pident_readdir(file, ctx,
+ attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
static const struct file_operations proc_attr_dir_operations = {
.read = generic_read_dir,
- .readdir = proc_attr_dir_readdir,
+ .iterate = proc_attr_dir_readdir,
+ .llseek = default_llseek,
};
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd)
+ struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
@@ -2270,7 +2309,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
char buffer[PROC_NUMBUF];
size_t len;
@@ -2322,7 +2361,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
goto out_no_task;
ret = -ESRCH;
- task = get_proc_task(file->f_dentry->d_inode);
+ task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
@@ -2348,162 +2387,25 @@ static ssize_t proc_coredump_filter_write(struct file *file,
static const struct file_operations proc_coredump_filter_operations = {
.read = proc_coredump_filter_read,
.write = proc_coredump_filter_write,
+ .llseek = generic_file_llseek,
};
#endif
-/*
- * /proc/self:
- */
-static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
-{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF];
- if (!tgid)
- return -ENOENT;
- sprintf(tmp, "%d", tgid);
- return vfs_readlink(dentry,buffer,buflen,tmp);
-}
-
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF];
- if (!tgid)
- return ERR_PTR(-ENOENT);
- sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
- return ERR_PTR(vfs_follow_link(nd,tmp));
-}
-
-static const struct inode_operations proc_self_inode_operations = {
- .readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
-};
-
-/*
- * proc base
- *
- * These are the directory entries in the root directory of /proc
- * that properly belong to the /proc filesystem, as they describe
- * describe something that is process related.
- */
-static const struct pid_entry proc_base_stuff[] = {
- NOD("self", S_IFLNK|S_IRWXUGO,
- &proc_self_inode_operations, NULL, {}),
-};
-
-/*
- * Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- */
-static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- if (task) {
- put_task_struct(task);
- return 1;
- }
- d_drop(dentry);
- return 0;
-}
-
-static const struct dentry_operations proc_base_dentry_operations =
-{
- .d_revalidate = proc_base_revalidate,
- .d_delete = pid_delete_dentry,
-};
-
-static struct dentry *proc_base_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
- const struct pid_entry *p = ptr;
- struct inode *inode;
- struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-EINVAL);
-
- /* Allocate the inode */
- error = ERR_PTR(-ENOMEM);
- inode = new_inode(dir->i_sb);
- if (!inode)
- goto out;
-
- /* Initialize the inode */
- ei = PROC_I(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
- /*
- * grab the reference to the task.
- */
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
- goto out_iput;
-
- inode->i_mode = p->mode;
- if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 2;
- if (S_ISLNK(inode->i_mode))
- inode->i_size = 64;
- if (p->iop)
- inode->i_op = p->iop;
- if (p->fop)
- inode->i_fop = p->fop;
- ei->op = p->op;
- dentry->d_op = &proc_base_dentry_operations;
- d_add(dentry, inode);
- error = NULL;
-out:
- return error;
-out_iput:
- iput(inode);
- goto out;
-}
-
-static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
-{
- struct dentry *error;
- struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
-
- error = ERR_PTR(-ENOENT);
-
- if (!task)
- goto out_no_task;
-
- /* Lookup the directory entry */
- last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
- for (p = proc_base_stuff; p <= last; p++) {
- if (p->len != dentry->d_name.len)
- continue;
- if (!memcmp(dentry->d_name.name, p->name, p->len))
- break;
- }
- if (p > last)
- goto out;
-
- error = proc_base_instantiate(dir, dentry, task, p);
-
-out:
- put_task_struct(task);
-out_no_task:
- return error;
-}
-
-static int proc_base_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
- return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
- proc_base_instantiate, task, p);
-}
-
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
{
struct task_io_accounting acct = task->ioac;
unsigned long flags;
+ int result;
+
+ result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (result)
+ return result;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ result = -EACCES;
+ goto out_unlock;
+ }
if (whole && lock_task_sighand(task, &flags)) {
struct task_struct *t = task;
@@ -2514,7 +2416,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
unlock_task_sighand(task, &flags);
}
- return sprintf(buffer,
+ result = sprintf(buffer,
"rchar: %llu\n"
"wchar: %llu\n"
"syscr: %llu\n"
@@ -2529,6 +2431,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
(unsigned long long)acct.read_bytes,
(unsigned long long)acct.write_bytes,
(unsigned long long)acct.cancelled_write_bytes);
+out_unlock:
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ return result;
}
static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
@@ -2542,11 +2447,96 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */
+#ifdef CONFIG_USER_NS
+static int proc_id_map_open(struct inode *inode, struct file *file,
+ struct seq_operations *seq_ops)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ struct seq_file *seq;
+ int ret = -EINVAL;
+
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ ret = seq_open(file, seq_ops);
+ if (ret)
+ goto err_put_ns;
+
+ seq = file->private_data;
+ seq->private = ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_id_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ put_user_ns(ns);
+ return seq_release(inode, file);
+}
+
+static int proc_uid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+}
+
+static int proc_gid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+}
+
+static int proc_projid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_projid_seq_operations);
+}
+
+static const struct file_operations proc_uid_map_operations = {
+ .open = proc_uid_map_open,
+ .write = proc_uid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static const struct file_operations proc_gid_map_operations = {
+ .open = proc_gid_map_open,
+ .write = proc_gid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static const struct file_operations proc_projid_map_operations = {
+ .open = proc_projid_map_open,
+ .write = proc_projid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+#endif /* CONFIG_USER_NS */
+
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- seq_printf(m, "%08x\n", task->personality);
- return 0;
+ int err = lock_trace(task);
+ if (!err) {
+ seq_printf(m, "%08x\n", task->personality);
+ unlock_trace(task);
+ }
+ return err;
}
/*
@@ -2558,7 +2548,11 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
@@ -2566,10 +2560,13 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+ REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
INF("syscall", S_IRUSR, proc_pid_syscall),
@@ -2577,9 +2574,9 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -2590,7 +2587,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -2615,7 +2612,8 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -2627,23 +2625,35 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, proc_tgid_io_accounting),
+ INF("io", S_IRUSR, proc_tgid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+#endif
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ REG("timers", S_IRUGO, proc_timers_operations),
#endif
};
-static int proc_tgid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
static const struct file_operations proc_tgid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tgid_base_readdir,
+ .iterate = proc_tgid_base_readdir,
+ .llseek = default_llseek,
};
-static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
return proc_pident_lookup(dir, dentry,
tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
@@ -2652,6 +2662,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.lookup = proc_tgid_base_lookup,
.getattr = pid_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2662,6 +2673,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", pid);
+ /* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
shrink_dcache_parent(dentry);
@@ -2736,17 +2748,12 @@ void proc_flush_task(struct task_struct *task)
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}
-static struct dentry *proc_pid_instantiate(struct inode *dir,
- struct dentry * dentry,
- struct task_struct *task, const void *ptr)
+static int proc_pid_instantiate(struct inode *dir,
+ struct dentry * dentry,
+ struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
struct inode *inode;
inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2758,30 +2765,26 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
- ARRAY_SIZE(tgid_base_stuff));
+ set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
+ ARRAY_SIZE(tgid_base_stuff)));
- dentry->d_op = &pid_dentry_operations;
+ d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, NULL))
- error = NULL;
+ if (pid_revalidate(dentry, 0))
+ return 0;
out:
- return error;
+ return -ENOENT;
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- struct dentry *result = ERR_PTR(-ENOENT);
+ int result = 0;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
- result = proc_base_lookup(dir, dentry);
- if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
- goto out;
-
tgid = name_to_int(dentry);
if (tgid == ~0U)
goto out;
@@ -2798,7 +2801,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
result = proc_pid_instantiate(dir, dentry, task, NULL);
put_task_struct(task);
out:
- return result;
+ return ERR_PTR(result);
}
/*
@@ -2844,50 +2847,44 @@ retry:
return iter;
}
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
-
-static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct tgid_iter iter)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", iter.tgid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_pid_instantiate, iter.task, NULL);
-}
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
/* for the /proc/ directory itself, after non-process stuff has been done */
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
- struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
struct tgid_iter iter;
- struct pid_namespace *ns;
+ struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
+ loff_t pos = ctx->pos;
- if (!reaper)
- goto out_no_task;
+ if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
+ return 0;
- for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
- const struct pid_entry *p = &proc_base_stuff[nr];
- if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
- goto out;
+ if (pos == TGID_OFFSET - 1) {
+ struct inode *inode = ns->proc_self->d_inode;
+ if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ return 0;
+ iter.tgid = 0;
+ } else {
+ iter.tgid = pos - TGID_OFFSET;
}
-
- ns = filp->f_dentry->d_sb->s_fs_info;
iter.task = NULL;
- iter.tgid = filp->f_pos - TGID_OFFSET;
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
- filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+ char name[PROC_NUMBUF];
+ int len;
+ if (!has_pid_permissions(ns, iter.task, 2))
+ continue;
+
+ len = snprintf(name, sizeof(name), "%d", iter.tgid);
+ ctx->pos = iter.tgid + TGID_OFFSET;
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_pid_instantiate, iter.task, NULL)) {
put_task_struct(iter.task);
- goto out;
+ return 0;
}
}
- filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
-out:
- put_task_struct(reaper);
-out_no_task:
+ ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
return 0;
}
@@ -2896,12 +2893,13 @@ out_no_task:
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
@@ -2912,9 +2910,12 @@ static const struct pid_entry tid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_tid_maps_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ REG("children", S_IRUGO, proc_tid_children_operations),
+#endif
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -2924,7 +2925,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_tid_smaps_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -2949,34 +2950,44 @@ static const struct pid_entry tid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
- REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
- REG("sessionid", S_IRUSR, proc_sessionid_operations),
+ REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, proc_tid_io_accounting),
+ INF("io", S_IRUSR, proc_tid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+ INF("hardwall", S_IRUGO, proc_pid_hardwall),
+#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
#endif
};
-static int proc_tid_base_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
- return proc_pident_readdir(filp,dirent,filldir,
- tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+ return proc_pident_readdir(file, ctx,
+ tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
-static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
return proc_pident_lookup(dir, dentry,
tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
static const struct file_operations proc_tid_base_operations = {
.read = generic_read_dir,
- .readdir = proc_tid_base_readdir,
+ .iterate = proc_tid_base_readdir,
+ .llseek = default_llseek,
};
static const struct inode_operations proc_tid_base_inode_operations = {
@@ -2985,10 +2996,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
.setattr = proc_setattr,
};
-static struct dentry *proc_task_instantiate(struct inode *dir,
+static int proc_task_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
- struct dentry *error = ERR_PTR(-ENOENT);
struct inode *inode;
inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2999,22 +3009,22 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
- ARRAY_SIZE(tid_base_stuff));
+ set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
+ ARRAY_SIZE(tid_base_stuff)));
- dentry->d_op = &pid_dentry_operations;
+ d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
- if (pid_revalidate(dentry, NULL))
- error = NULL;
+ if (pid_revalidate(dentry, 0))
+ return 0;
out:
- return error;
+ return -ENOENT;
}
-static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- struct dentry *result = ERR_PTR(-ENOENT);
+ int result = -ENOENT;
struct task_struct *task;
struct task_struct *leader = get_proc_task(dir);
unsigned tid;
@@ -3044,7 +3054,7 @@ out_drop_task:
out:
put_task_struct(leader);
out_no_task:
- return result;
+ return ERR_PTR(result);
}
/*
@@ -3059,34 +3069,42 @@ out_no_task:
* In the case of a seek we start with the leader and walk nr
* threads past it.
*/
-static struct task_struct *first_tid(struct task_struct *leader,
- int tid, int nr, struct pid_namespace *ns)
+static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+ struct pid_namespace *ns)
{
- struct task_struct *pos;
+ struct task_struct *pos, *task;
+ unsigned long nr = f_pos;
+
+ if (nr != f_pos) /* 32bit overflow? */
+ return NULL;
rcu_read_lock();
- /* Attempt to start with the pid of a thread */
- if (tid && (nr > 0)) {
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ goto fail;
+
+ /* Attempt to start with the tid of a thread */
+ if (tid && nr) {
pos = find_task_by_pid_ns(tid, ns);
- if (pos && (pos->group_leader == leader))
+ if (pos && same_thread_group(pos, task))
goto found;
}
/* If nr exceeds the number of threads there is nothing todo */
- pos = NULL;
- if (nr && nr >= get_nr_threads(leader))
- goto out;
+ if (nr >= get_nr_threads(task))
+ goto fail;
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- for (pos = leader; nr > 0; --nr) {
- pos = next_thread(pos);
- if (pos == leader) {
- pos = NULL;
- goto out;
- }
- }
+ pos = task = task->group_leader;
+ do {
+ if (!nr--)
+ goto found;
+ } while_each_thread(task, pos);
+fail:
+ pos = NULL;
+ goto out;
found:
get_task_struct(pos);
out:
@@ -3116,78 +3134,44 @@ static struct task_struct *next_tid(struct task_struct *start)
return pos;
}
-static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- struct task_struct *task, int tid)
-{
- char name[PROC_NUMBUF];
- int len = snprintf(name, sizeof(name), "%d", tid);
- return proc_fill_cache(filp, dirent, filldir, name, len,
- proc_task_instantiate, task, NULL);
-}
-
/* for the /proc/TGID/task/ directories */
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct task_struct *leader = NULL;
+ struct inode *inode = file_inode(file);
struct task_struct *task;
- int retval = -ENOENT;
- ino_t ino;
- int tid;
struct pid_namespace *ns;
+ int tid;
- task = get_proc_task(inode);
- if (!task)
- goto out_no_task;
- rcu_read_lock();
- if (pid_alive(task)) {
- leader = task->group_leader;
- get_task_struct(leader);
- }
- rcu_read_unlock();
- put_task_struct(task);
- if (!leader)
- goto out_no_task;
- retval = 0;
+ if (proc_inode_is_dead(inode))
+ return -ENOENT;
- switch ((unsigned long)filp->f_pos) {
- case 0:
- ino = inode->i_ino;
- if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- case 1:
- ino = parent_ino(dentry);
- if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- /* fall through */
- }
+ if (!dir_emit_dots(file, ctx))
+ return 0;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- ns = filp->f_dentry->d_sb->s_fs_info;
- tid = (int)filp->f_version;
- filp->f_version = 0;
- for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
+ ns = file->f_dentry->d_sb->s_fs_info;
+ tid = (int)file->f_version;
+ file->f_version = 0;
+ for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
- task = next_tid(task), filp->f_pos++) {
+ task = next_tid(task), ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
tid = task_pid_nr_ns(task, ns);
- if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
+ len = snprintf(name, sizeof(name), "%d", tid);
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- filp->f_version = (u64)tid;
+ file->f_version = (u64)tid;
put_task_struct(task);
break;
}
}
-out:
- put_task_struct(leader);
-out_no_task:
- return retval;
+
+ return 0;
}
static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3208,9 +3192,11 @@ static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
- .readdir = proc_task_readdir,
+ .iterate = proc_task_readdir,
+ .llseek = default_llseek,
};
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3fcd1..cbd82dff7e8 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void)
proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
return 0;
}
-module_init(proc_cmdline_init);
+fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 00000000000..290ba85cb90
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ *
+ * Licensed under GPLv2
+ */
+
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+
+/*
+ * This is handler for /proc/consoles
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+ static const struct {
+ short flag;
+ char name;
+ } con_flags[] = {
+ { CON_ENABLED, 'E' },
+ { CON_CONSDEV, 'C' },
+ { CON_BOOT, 'B' },
+ { CON_PRINTBUFFER, 'p' },
+ { CON_BRL, 'b' },
+ { CON_ANYTIME, 'a' },
+ };
+ char flags[ARRAY_SIZE(con_flags) + 1];
+ struct console *con = v;
+ unsigned int a;
+ dev_t dev = 0;
+
+ if (con->device) {
+ const struct tty_driver *driver;
+ int index;
+ driver = con->device(con, &index);
+ if (driver) {
+ dev = MKDEV(driver->major, driver->minor_start);
+ dev += index;
+ }
+ }
+
+ for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+ flags[a] = (con->flags & con_flags[a].flag) ?
+ con_flags[a].name : ' ';
+ flags[a] = 0;
+
+ seq_setwidth(m, 21 - 1);
+ seq_printf(m, "%s%d", con->name, con->index);
+ seq_pad(m, ' ');
+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
+ con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+ flags);
+ if (dev)
+ seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+
+ seq_printf(m, "\n");
+
+ return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+ struct console *con;
+ loff_t off = 0;
+
+ console_lock();
+ for_each_console(con)
+ if (off++ == *pos)
+ break;
+
+ return con;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct console *con = v;
+ ++*pos;
+ return con->next;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+ console_unlock();
+}
+
+static const struct seq_operations consoles_op = {
+ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+ .show = show_console_dev
+};
+
+static int consoles_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &consoles_op);
+}
+
+static const struct file_operations proc_consoles_operations = {
+ .open = consoles_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_consoles_init(void)
+{
+ proc_create("consoles", 0, NULL, &proc_consoles_operations);
+ return 0;
+}
+fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539a234..06f4d31e039 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void)
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
return 0;
}
-module_init(proc_cpuinfo_init);
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c..50493edc30e 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
if (i < CHRDEV_MAJOR_HASH_SIZE) {
if (i == 0)
- seq_printf(f, "Character devices:\n");
+ seq_puts(f, "Character devices:\n");
chrdev_show(f, i);
}
#ifdef CONFIG_BLOCK
else {
i -= CHRDEV_MAJOR_HASH_SIZE;
if (i == 0)
- seq_printf(f, "\nBlock devices:\n");
+ seq_puts(f, "\nBlock devices:\n");
blkdev_show(f, i);
}
#endif
@@ -67,4 +67,4 @@ static int __init proc_devices_init(void)
proc_create("devices", 0, NULL, &proc_devinfo_operations);
return 0;
}
-module_init(proc_devices_init);
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
new file mode 100644
index 00000000000..0788d093f5d
--- /dev/null
+++ b/fs/proc/fd.c
@@ -0,0 +1,351 @@
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/dcache.h>
+#include <linux/path.h>
+#include <linux/fdtable.h>
+#include <linux/namei.h>
+#include <linux/pid.h>
+#include <linux/security.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+
+#include <linux/proc_fs.h>
+
+#include "../mount.h"
+#include "internal.h"
+#include "fd.h"
+
+static int seq_show(struct seq_file *m, void *v)
+{
+ struct files_struct *files = NULL;
+ int f_flags = 0, ret = -ENOENT;
+ struct file *file = NULL;
+ struct task_struct *task;
+
+ task = get_proc_task(m->private);
+ if (!task)
+ return -ENOENT;
+
+ files = get_files_struct(task);
+ put_task_struct(task);
+
+ if (files) {
+ int fd = proc_fd(m->private);
+
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+ struct fdtable *fdt = files_fdtable(files);
+
+ f_flags = file->f_flags;
+ if (close_on_exec(fd, fdt))
+ f_flags |= O_CLOEXEC;
+
+ get_file(file);
+ ret = 0;
+ }
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+
+ if (!ret) {
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
+ if (file->f_op->show_fdinfo)
+ ret = file->f_op->show_fdinfo(m, file);
+ fput(file);
+ }
+
+ return ret;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, seq_show, inode);
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+ .open = seq_fdinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct files_struct *files;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int fd;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ fd = proc_fd(inode);
+
+ if (task) {
+ files = get_files_struct(task);
+ if (files) {
+ struct file *file;
+
+ rcu_read_lock();
+ file = fcheck_files(files, fd);
+ if (file) {
+ unsigned f_mode = file->f_mode;
+
+ rcu_read_unlock();
+ put_files_struct(files);
+
+ if (task_dumpable(task)) {
+ rcu_read_lock();
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
+ rcu_read_unlock();
+ } else {
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
+
+ security_task_to_inode(task, inode);
+ put_task_struct(task);
+ return 1;
+ }
+ rcu_read_unlock();
+ put_files_struct(files);
+ }
+ put_task_struct(task);
+ }
+
+ d_drop(dentry);
+ return 0;
+}
+
+static const struct dentry_operations tid_fd_dentry_operations = {
+ .d_revalidate = tid_fd_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int proc_fd_link(struct dentry *dentry, struct path *path)
+{
+ struct files_struct *files = NULL;
+ struct task_struct *task;
+ int ret = -ENOENT;
+
+ task = get_proc_task(dentry->d_inode);
+ if (task) {
+ files = get_files_struct(task);
+ put_task_struct(task);
+ }
+
+ if (files) {
+ int fd = proc_fd(dentry->d_inode);
+ struct file *fd_file;
+
+ spin_lock(&files->file_lock);
+ fd_file = fcheck_files(files, fd);
+ if (fd_file) {
+ *path = fd_file->f_path;
+ path_get(&fd_file->f_path);
+ ret = 0;
+ }
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+
+ return ret;
+}
+
+static int
+proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ unsigned fd = (unsigned long)ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ goto out;
+
+ ei = PROC_I(inode);
+ ei->fd = fd;
+
+ inode->i_mode = S_IFLNK;
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+
+ ei->op.proc_get_link = proc_fd_link;
+
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ d_add(dentry, inode);
+
+ /* Close the race of the process dying before we return the dentry */
+ if (tid_fd_revalidate(dentry, 0))
+ return 0;
+ out:
+ return -ENOENT;
+}
+
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+ struct dentry *dentry,
+ instantiate_t instantiate)
+{
+ struct task_struct *task = get_proc_task(dir);
+ int result = -ENOENT;
+ unsigned fd = name_to_int(dentry);
+
+ if (!task)
+ goto out_no_task;
+ if (fd == ~0U)
+ goto out;
+
+ result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+out:
+ put_task_struct(task);
+out_no_task:
+ return ERR_PTR(result);
+}
+
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ instantiate_t instantiate)
+{
+ struct task_struct *p = get_proc_task(file_inode(file));
+ struct files_struct *files;
+ unsigned int fd;
+
+ if (!p)
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ files = get_files_struct(p);
+ if (!files)
+ goto out;
+
+ rcu_read_lock();
+ for (fd = ctx->pos - 2;
+ fd < files_fdtable(files)->max_fds;
+ fd++, ctx->pos++) {
+ char name[PROC_NUMBUF];
+ int len;
+
+ if (!fcheck_files(files, fd))
+ continue;
+ rcu_read_unlock();
+
+ len = snprintf(name, sizeof(name), "%d", fd);
+ if (!proc_fill_cache(file, ctx,
+ name, len, instantiate, p,
+ (void *)(unsigned long)fd))
+ goto out_fd_loop;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out_fd_loop:
+ put_files_struct(files);
+out:
+ put_task_struct(p);
+ return 0;
+}
+
+static int proc_readfd(struct file *file, struct dir_context *ctx)
+{
+ return proc_readfd_common(file, ctx, proc_fd_instantiate);
+}
+
+const struct file_operations proc_fd_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_readfd,
+ .llseek = default_llseek,
+};
+
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+int proc_fd_permission(struct inode *inode, int mask)
+{
+ int rv = generic_permission(inode, mask);
+ if (rv == 0)
+ return 0;
+ if (task_tgid(current) == proc_pid(inode))
+ rv = 0;
+ return rv;
+}
+
+const struct inode_operations proc_fd_inode_operations = {
+ .lookup = proc_lookupfd,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ unsigned fd = (unsigned long)ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ goto out;
+
+ ei = PROC_I(inode);
+ ei->fd = fd;
+
+ inode->i_mode = S_IFREG | S_IRUSR;
+ inode->i_fop = &proc_fdinfo_file_operations;
+
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ d_add(dentry, inode);
+
+ /* Close the race of the process dying before we return the dentry */
+ if (tid_fd_revalidate(dentry, 0))
+ return 0;
+ out:
+ return -ENOENT;
+}
+
+static struct dentry *
+proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+{
+ return proc_readfd_common(file, ctx,
+ proc_fdinfo_instantiate);
+}
+
+const struct inode_operations proc_fdinfo_inode_operations = {
+ .lookup = proc_lookupfdinfo,
+ .setattr = proc_setattr,
+};
+
+const struct file_operations proc_fdinfo_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_readfdinfo,
+ .llseek = default_llseek,
+};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
new file mode 100644
index 00000000000..7c047f256ae
--- /dev/null
+++ b/fs/proc/fd.h
@@ -0,0 +1,19 @@
+#ifndef __PROCFS_FD_H__
+#define __PROCFS_FD_H__
+
+#include <linux/fs.h>
+
+extern const struct file_operations proc_fd_operations;
+extern const struct inode_operations proc_fd_inode_operations;
+
+extern const struct file_operations proc_fdinfo_operations;
+extern const struct inode_operations proc_fdinfo_inode_operations;
+
+extern int proc_fd_permission(struct inode *inode, int mask);
+
+static inline int proc_fd(struct inode *inode)
+{
+ return PROC_I(inode)->fd;
+}
+
+#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065ee..b7f268eb5f4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,7 +12,10 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
@@ -26,229 +29,13 @@
DEFINE_SPINLOCK(proc_subdir_lock);
-static int proc_match(int len, const char *name, struct proc_dir_entry *de)
+static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
if (de->namelen != len)
return 0;
return !memcmp(name, de->name, len);
}
-/* buffer size is one page but our output routines use some slack for overruns */
-#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
-
-static ssize_t
-__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos)
-{
- struct inode * inode = file->f_path.dentry->d_inode;
- char *page;
- ssize_t retval=0;
- int eof=0;
- ssize_t n, count;
- char *start;
- struct proc_dir_entry * dp;
- unsigned long long pos;
-
- /*
- * Gaah, please just use "seq_file" instead. The legacy /proc
- * interfaces cut loff_t down to off_t for reads, and ignore
- * the offset entirely for writes..
- */
- pos = *ppos;
- if (pos > MAX_NON_LFS)
- return 0;
- if (nbytes > MAX_NON_LFS - pos)
- nbytes = MAX_NON_LFS - pos;
-
- dp = PDE(inode);
- if (!(page = (char*) __get_free_page(GFP_TEMPORARY)))
- return -ENOMEM;
-
- while ((nbytes > 0) && !eof) {
- count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
-
- start = NULL;
- if (dp->read_proc) {
- /*
- * How to be a proc read function
- * ------------------------------
- * Prototype:
- * int f(char *buffer, char **start, off_t offset,
- * int count, int *peof, void *dat)
- *
- * Assume that the buffer is "count" bytes in size.
- *
- * If you know you have supplied all the data you
- * have, set *peof.
- *
- * You have three ways to return data:
- * 0) Leave *start = NULL. (This is the default.)
- * Put the data of the requested offset at that
- * offset within the buffer. Return the number (n)
- * of bytes there are from the beginning of the
- * buffer up to the last byte of data. If the
- * number of supplied bytes (= n - offset) is
- * greater than zero and you didn't signal eof
- * and the reader is prepared to take more data
- * you will be called again with the requested
- * offset advanced by the number of bytes
- * absorbed. This interface is useful for files
- * no larger than the buffer.
- * 1) Set *start = an unsigned long value less than
- * the buffer address but greater than zero.
- * Put the data of the requested offset at the
- * beginning of the buffer. Return the number of
- * bytes of data placed there. If this number is
- * greater than zero and you didn't signal eof
- * and the reader is prepared to take more data
- * you will be called again with the requested
- * offset advanced by *start. This interface is
- * useful when you have a large file consisting
- * of a series of blocks which you want to count
- * and return as wholes.
- * (Hack by Paul.Russell@rustcorp.com.au)
- * 2) Set *start = an address within the buffer.
- * Put the data of the requested offset at *start.
- * Return the number of bytes of data placed there.
- * If this number is greater than zero and you
- * didn't signal eof and the reader is prepared to
- * take more data you will be called again with the
- * requested offset advanced by the number of bytes
- * absorbed.
- */
- n = dp->read_proc(page, &start, *ppos,
- count, &eof, dp->data);
- } else
- break;
-
- if (n == 0) /* end of file */
- break;
- if (n < 0) { /* error */
- if (retval == 0)
- retval = n;
- break;
- }
-
- if (start == NULL) {
- if (n > PAGE_SIZE) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
- n = PAGE_SIZE;
- }
- n -= *ppos;
- if (n <= 0)
- break;
- if (n > count)
- n = count;
- start = page + *ppos;
- } else if (start < page) {
- if (n > PAGE_SIZE) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
- n = PAGE_SIZE;
- }
- if (n > count) {
- /*
- * Don't reduce n because doing so might
- * cut off part of a data block.
- */
- printk(KERN_WARNING
- "proc_file_read: Read count exceeded\n");
- }
- } else /* start >= page */ {
- unsigned long startoff = (unsigned long)(start - page);
- if (n > (PAGE_SIZE - startoff)) {
- printk(KERN_ERR
- "proc_file_read: Apparent buffer overflow!\n");
- n = PAGE_SIZE - startoff;
- }
- if (n > count)
- n = count;
- }
-
- n -= copy_to_user(buf, start < page ? page : start, n);
- if (n == 0) {
- if (retval == 0)
- retval = -EFAULT;
- break;
- }
-
- *ppos += start < page ? (unsigned long)start : n;
- nbytes -= n;
- buf += n;
- retval += n;
- }
- free_page((unsigned long) page);
- return retval;
-}
-
-static ssize_t
-proc_file_read(struct file *file, char __user *buf, size_t nbytes,
- loff_t *ppos)
-{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- ssize_t rv = -EIO;
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- spin_unlock(&pde->pde_unload_lock);
-
- rv = __proc_file_read(file, buf, nbytes, ppos);
-
- pde_users_dec(pde);
- return rv;
-}
-
-static ssize_t
-proc_file_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- ssize_t rv = -EIO;
-
- if (pde->write_proc) {
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- spin_unlock(&pde->pde_unload_lock);
-
- /* FIXME: does this routine need ppos? probably... */
- rv = pde->write_proc(file, buffer, count, pde->data);
- pde_users_dec(pde);
- }
- return rv;
-}
-
-
-static loff_t
-proc_file_lseek(struct file *file, loff_t offset, int orig)
-{
- loff_t retval = -EINVAL;
- switch (orig) {
- case 1:
- offset += file->f_pos;
- /* fallthrough */
- case 0:
- if (offset < 0 || offset > MAX_NON_LFS)
- break;
- file->f_pos = retval = offset;
- }
- return retval;
-}
-
-static const struct file_operations proc_file_operations = {
- .llseek = proc_file_lseek,
- .read = proc_file_read,
- .write = proc_file_write,
-};
-
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = dentry->d_inode;
@@ -257,17 +44,14 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
error = inode_change_ok(inode, iattr);
if (error)
- goto out;
+ return error;
- error = inode_setattr(inode, iattr);
- if (error)
- goto out;
-
- de->uid = inode->i_uid;
- de->gid = inode->i_gid;
+ setattr_copy(inode, iattr);
+ mark_inode_dirty(inode);
+
+ proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
-out:
- return error;
+ return 0;
}
static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -276,7 +60,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode = dentry->d_inode;
struct proc_dir_entry *de = PROC_I(inode)->pde;
if (de && de->nlink)
- inode->i_nlink = de->nlink;
+ set_nlink(inode, de->nlink);
generic_fillattr(inode, stat);
return 0;
@@ -291,19 +75,17 @@ static const struct inode_operations proc_file_inode_operations = {
* returns the struct proc_dir_entry for "/proc/tty/driver", and
* returns "serial" in residual.
*/
-static int xlate_proc_name(const char *name,
- struct proc_dir_entry **ret, const char **residual)
+static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
{
const char *cp = name, *next;
struct proc_dir_entry *de;
- int len;
- int rtn = 0;
+ unsigned int len;
de = *ret;
if (!de)
de = &proc_root;
- spin_lock(&proc_subdir_lock);
while (1) {
next = strchr(cp, '/');
if (!next)
@@ -315,16 +97,25 @@ static int xlate_proc_name(const char *name,
break;
}
if (!de) {
- rtn = -ENOENT;
- goto out;
+ WARN(1, "name '%s'\n", name);
+ return -ENOENT;
}
cp += len + 1;
}
*residual = cp;
*ret = de;
-out:
+ return 0;
+}
+
+static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
+{
+ int rv;
+
+ spin_lock(&proc_subdir_lock);
+ rv = __xlate_proc_name(name, ret, residual);
spin_unlock(&proc_subdir_lock);
- return rtn;
+ return rv;
}
static DEFINE_IDA(proc_inum_ida);
@@ -335,58 +126,45 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
/*
* Return an inode number between PROC_DYNAMIC_FIRST and
* 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000 reserved
- * 00000001-00000fff static entries (goners)
- * 001 root-ino
- *
- * 00001000-00001fff unused
- * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
- * 80000000-efffffff unused
- * f0000000-ffffffff dynamic entries
- *
- * Goal:
- * Once we split the thing into several virtual filesystems,
- * we will get rid of magical ranges (and this comment, BTW).
*/
-static unsigned int get_inode_number(void)
+int proc_alloc_inum(unsigned int *inum)
{
unsigned int i;
int error;
retry:
- if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
- return 0;
+ if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
+ return -ENOMEM;
- spin_lock(&proc_inum_lock);
+ spin_lock_irq(&proc_inum_lock);
error = ida_get_new(&proc_inum_ida, &i);
- spin_unlock(&proc_inum_lock);
+ spin_unlock_irq(&proc_inum_lock);
if (error == -EAGAIN)
goto retry;
else if (error)
- return 0;
+ return error;
if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
- spin_lock(&proc_inum_lock);
+ spin_lock_irq(&proc_inum_lock);
ida_remove(&proc_inum_ida, i);
- spin_unlock(&proc_inum_lock);
- return 0;
+ spin_unlock_irq(&proc_inum_lock);
+ return -ENOSPC;
}
- return PROC_DYNAMIC_FIRST + i;
+ *inum = PROC_DYNAMIC_FIRST + i;
+ return 0;
}
-static void release_inode_number(unsigned int inum)
+void proc_free_inum(unsigned int inum)
{
- spin_lock(&proc_inum_lock);
+ unsigned long flags;
+ spin_lock_irqsave(&proc_inum_lock, flags);
ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
- spin_unlock(&proc_inum_lock);
+ spin_unlock_irqrestore(&proc_inum_lock, flags);
}
static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, PDE(dentry->d_inode)->data);
+ nd_set_link(nd, __PDE_DATA(dentry->d_inode));
return NULL;
}
@@ -396,61 +174,35 @@ static const struct inode_operations proc_link_inode_operations = {
};
/*
- * As some entries in /proc are volatile, we want to
- * get rid of unused dentries. This could be made
- * smarter: we could keep a "volatile" flag in the
- * inode to indicate which ones to keep.
- */
-static int proc_delete_dentry(struct dentry * dentry)
-{
- return 1;
-}
-
-static const struct dentry_operations proc_dentry_operations =
-{
- .d_delete = proc_delete_dentry,
-};
-
-/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
*/
struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = NULL;
- int error = -ENOENT;
+ struct inode *inode;
spin_lock(&proc_subdir_lock);
for (de = de->subdir; de ; de = de->next) {
if (de->namelen != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
- unsigned int ino;
-
- ino = de->low_ino;
pde_get(de);
spin_unlock(&proc_subdir_lock);
- error = -EINVAL;
- inode = proc_get_inode(dir->i_sb, ino, de);
- goto out_unlock;
+ inode = proc_get_inode(dir->i_sb, de);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ d_set_d_op(dentry, &simple_dentry_operations);
+ d_add(dentry, inode);
+ return NULL;
}
}
spin_unlock(&proc_subdir_lock);
-out_unlock:
-
- if (inode) {
- dentry->d_op = &proc_dentry_operations;
- d_add(dentry, inode);
- return NULL;
- }
- if (de)
- pde_put(de);
- return ERR_PTR(error);
+ return ERR_PTR(-ENOENT);
}
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
return proc_lookup_de(PDE(dir), dir, dentry);
}
@@ -464,76 +216,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+ struct dir_context *ctx)
{
- unsigned int ino;
int i;
- struct inode *inode = filp->f_path.dentry->d_inode;
- int ret = 0;
-
- ino = inode->i_ino;
- i = filp->f_pos;
- switch (i) {
- case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- case 1:
- if (filldir(dirent, "..", 2, i,
- parent_ino(filp->f_path.dentry),
- DT_DIR) < 0)
- goto out;
- i++;
- filp->f_pos++;
- /* fall through */
- default:
- spin_lock(&proc_subdir_lock);
- de = de->subdir;
- i -= 2;
- for (;;) {
- if (!de) {
- ret = 1;
- spin_unlock(&proc_subdir_lock);
- goto out;
- }
- if (!i)
- break;
- de = de->next;
- i--;
- }
- do {
- struct proc_dir_entry *next;
-
- /* filldir passes info to user space */
- pde_get(de);
- spin_unlock(&proc_subdir_lock);
- if (filldir(dirent, de->name, de->namelen, filp->f_pos,
- de->low_ino, de->mode >> 12) < 0) {
- pde_put(de);
- goto out;
- }
- spin_lock(&proc_subdir_lock);
- filp->f_pos++;
- next = de->next;
- pde_put(de);
- de = next;
- } while (de);
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ spin_lock(&proc_subdir_lock);
+ de = de->subdir;
+ i = ctx->pos - 2;
+ for (;;) {
+ if (!de) {
spin_unlock(&proc_subdir_lock);
+ return 0;
+ }
+ if (!i)
+ break;
+ de = de->next;
+ i--;
}
- ret = 1;
-out:
- return ret;
+
+ do {
+ struct proc_dir_entry *next;
+ pde_get(de);
+ spin_unlock(&proc_subdir_lock);
+ if (!dir_emit(ctx, de->name, de->namelen,
+ de->low_ino, de->mode >> 12)) {
+ pde_put(de);
+ return 0;
+ }
+ spin_lock(&proc_subdir_lock);
+ ctx->pos++;
+ next = de->next;
+ pde_put(de);
+ de = next;
+ } while (de);
+ spin_unlock(&proc_subdir_lock);
+ return 1;
}
-int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int proc_readdir(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
- return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+ return proc_readdir_de(PDE(inode), file, ctx);
}
/*
@@ -544,7 +272,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
static const struct file_operations proc_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_readdir,
+ .iterate = proc_readdir,
};
/*
@@ -558,35 +286,32 @@ static const struct inode_operations proc_dir_inode_operations = {
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
- unsigned int i;
struct proc_dir_entry *tmp;
+ int ret;
- i = get_inode_number();
- if (i == 0)
- return -EAGAIN;
- dp->low_ino = i;
+ ret = proc_alloc_inum(&dp->low_ino);
+ if (ret)
+ return ret;
if (S_ISDIR(dp->mode)) {
- if (dp->proc_iops == NULL) {
- dp->proc_fops = &proc_dir_operations;
- dp->proc_iops = &proc_dir_inode_operations;
- }
+ dp->proc_fops = &proc_dir_operations;
+ dp->proc_iops = &proc_dir_inode_operations;
dir->nlink++;
} else if (S_ISLNK(dp->mode)) {
- if (dp->proc_iops == NULL)
- dp->proc_iops = &proc_link_inode_operations;
+ dp->proc_iops = &proc_link_inode_operations;
} else if (S_ISREG(dp->mode)) {
- if (dp->proc_fops == NULL)
- dp->proc_fops = &proc_file_operations;
- if (dp->proc_iops == NULL)
- dp->proc_iops = &proc_file_inode_operations;
+ BUG_ON(dp->proc_fops == NULL);
+ dp->proc_iops = &proc_file_inode_operations;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
}
spin_lock(&proc_subdir_lock);
for (tmp = dir->subdir; tmp; tmp = tmp->next)
if (strcmp(tmp->name, dp->name) == 0) {
- WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+ WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
break;
}
@@ -601,15 +326,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
const char *name,
- mode_t mode,
+ umode_t mode,
nlink_t nlink)
{
struct proc_dir_entry *ent = NULL;
const char *fn = name;
- int len;
+ unsigned int len;
/* make sure name is valid */
- if (!name || !strlen(name)) goto out;
+ if (!name || !strlen(name))
+ goto out;
if (xlate_proc_name(name, parent, &fn) != 0)
goto out;
@@ -620,21 +346,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
len = strlen(fn);
- ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
- if (!ent) goto out;
+ ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+ if (!ent)
+ goto out;
- memset(ent, 0, sizeof(struct proc_dir_entry));
- memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1);
- ent->name = ((char *) ent) + sizeof(*ent);
+ memcpy(ent->name, fn, len + 1);
ent->namelen = len;
ent->mode = mode;
ent->nlink = nlink;
atomic_set(&ent->count, 1);
- ent->pde_users = 0;
spin_lock_init(&ent->pde_unload_lock);
- ent->pde_unload_completion = NULL;
INIT_LIST_HEAD(&ent->pde_openers);
- out:
+out:
return ent;
}
@@ -662,14 +385,19 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
return ent;
}
+EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
- struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
{
struct proc_dir_entry *ent;
+ if (mode == 0)
+ mode = S_IRUGO | S_IXUGO;
+
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
+ ent->data = data;
if (proc_register(parent, ent) < 0) {
kfree(ent);
ent = NULL;
@@ -677,79 +405,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
}
return ent;
}
+EXPORT_SYMBOL_GPL(proc_mkdir_data);
-struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
- struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
+ struct proc_dir_entry *parent)
{
- struct proc_dir_entry *ent;
-
- ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2);
- if (ent) {
- ent->data = net;
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- ent = NULL;
- }
- }
- return ent;
+ return proc_mkdir_data(name, mode, parent, NULL);
}
-EXPORT_SYMBOL_GPL(proc_net_mkdir);
+EXPORT_SYMBOL(proc_mkdir_mode);
struct proc_dir_entry *proc_mkdir(const char *name,
struct proc_dir_entry *parent)
{
- return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
+ return proc_mkdir_data(name, 0, parent, NULL);
}
+EXPORT_SYMBOL(proc_mkdir);
-struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
- struct proc_dir_entry *parent)
-{
- struct proc_dir_entry *ent;
- nlink_t nlink;
-
- if (S_ISDIR(mode)) {
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO | S_IXUGO;
- nlink = 2;
- } else {
- if ((mode & S_IFMT) == 0)
- mode |= S_IFREG;
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- nlink = 1;
- }
-
- ent = __proc_create(&parent, name, mode, nlink);
- if (ent) {
- if (proc_register(parent, ent) < 0) {
- kfree(ent);
- ent = NULL;
- }
- }
- return ent;
-}
-
-struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,
void *data)
{
struct proc_dir_entry *pde;
- nlink_t nlink;
+ if ((mode & S_IFMT) == 0)
+ mode |= S_IFREG;
- if (S_ISDIR(mode)) {
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO | S_IXUGO;
- nlink = 2;
- } else {
- if ((mode & S_IFMT) == 0)
- mode |= S_IFREG;
- if ((mode & S_IALLUGO) == 0)
- mode |= S_IRUGO;
- nlink = 1;
+ if (!S_ISREG(mode)) {
+ WARN_ON(1); /* use proc_mkdir() */
+ return NULL;
}
- pde = __proc_create(&parent, name, mode, nlink);
+ if ((mode & S_IALLUGO) == 0)
+ mode |= S_IRUGO;
+ pde = __proc_create(&parent, name, mode, 1);
if (!pde)
goto out;
pde->proc_fops = proc_fops;
@@ -762,15 +450,24 @@ out_free:
out:
return NULL;
}
-
-static void free_proc_entry(struct proc_dir_entry *de)
+EXPORT_SYMBOL(proc_create_data);
+
+void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
- unsigned int ino = de->low_ino;
+ de->size = size;
+}
+EXPORT_SYMBOL(proc_set_size);
- if (ino < PROC_DYNAMIC_FIRST)
- return;
+void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
+{
+ de->uid = uid;
+ de->gid = gid;
+}
+EXPORT_SYMBOL(proc_set_user);
- release_inode_number(ino);
+static void free_proc_entry(struct proc_dir_entry *de)
+{
+ proc_free_inum(de->low_ino);
if (S_ISLNK(de->mode))
kfree(de->data);
@@ -791,13 +488,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
struct proc_dir_entry **p;
struct proc_dir_entry *de = NULL;
const char *fn = name;
- int len;
+ unsigned int len;
- if (xlate_proc_name(name, &parent, &fn) != 0)
+ spin_lock(&proc_subdir_lock);
+ if (__xlate_proc_name(name, &parent, &fn) != 0) {
+ spin_unlock(&proc_subdir_lock);
return;
+ }
len = strlen(fn);
- spin_lock(&proc_subdir_lock);
for (p = &parent->subdir; *p; p=&(*p)->next ) {
if (proc_match(len, fn, *p)) {
de = *p;
@@ -807,49 +506,93 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
}
}
spin_unlock(&proc_subdir_lock);
- if (!de)
+ if (!de) {
+ WARN(1, "name '%s'\n", name);
return;
+ }
- spin_lock(&de->pde_unload_lock);
- /*
- * Stop accepting new callers into module. If you're
- * dynamically allocating ->proc_fops, save a pointer somewhere.
- */
- de->proc_fops = NULL;
- /* Wait until all existing callers into module are done. */
- if (de->pde_users > 0) {
- DECLARE_COMPLETION_ONSTACK(c);
+ proc_entry_rundown(de);
- if (!de->pde_unload_completion)
- de->pde_unload_completion = &c;
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
+ de->nlink = 0;
+ WARN(de->subdir, "%s: removing non-empty directory "
+ "'%s/%s', leaking at least '%s'\n", __func__,
+ de->parent->name, de->name, de->subdir->name);
+ pde_put(de);
+}
+EXPORT_SYMBOL(remove_proc_entry);
- spin_unlock(&de->pde_unload_lock);
+int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
+{
+ struct proc_dir_entry **p;
+ struct proc_dir_entry *root = NULL, *de, *next;
+ const char *fn = name;
+ unsigned int len;
- wait_for_completion(de->pde_unload_completion);
+ spin_lock(&proc_subdir_lock);
+ if (__xlate_proc_name(name, &parent, &fn) != 0) {
+ spin_unlock(&proc_subdir_lock);
+ return -ENOENT;
+ }
+ len = strlen(fn);
- goto continue_removing;
+ for (p = &parent->subdir; *p; p=&(*p)->next ) {
+ if (proc_match(len, fn, *p)) {
+ root = *p;
+ *p = root->next;
+ root->next = NULL;
+ break;
+ }
}
- spin_unlock(&de->pde_unload_lock);
-
-continue_removing:
- spin_lock(&de->pde_unload_lock);
- while (!list_empty(&de->pde_openers)) {
- struct pde_opener *pdeo;
-
- pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
- list_del(&pdeo->lh);
- spin_unlock(&de->pde_unload_lock);
- pdeo->release(pdeo->inode, pdeo->file);
- kfree(pdeo);
- spin_lock(&de->pde_unload_lock);
+ if (!root) {
+ spin_unlock(&proc_subdir_lock);
+ return -ENOENT;
}
- spin_unlock(&de->pde_unload_lock);
+ de = root;
+ while (1) {
+ next = de->subdir;
+ if (next) {
+ de->subdir = next->next;
+ next->next = NULL;
+ de = next;
+ continue;
+ }
+ spin_unlock(&proc_subdir_lock);
+
+ proc_entry_rundown(de);
+ next = de->parent;
+ if (S_ISDIR(de->mode))
+ next->nlink--;
+ de->nlink = 0;
+ if (de == root)
+ break;
+ pde_put(de);
- if (S_ISDIR(de->mode))
- parent->nlink--;
- de->nlink = 0;
- WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
- "'%s/%s', leaking at least '%s'\n", __func__,
- de->parent->name, de->name, de->subdir->name);
- pde_put(de);
+ spin_lock(&proc_subdir_lock);
+ de = next;
+ }
+ pde_put(root);
+ return 0;
+}
+EXPORT_SYMBOL(remove_proc_subtree);
+
+void *proc_get_parent_data(const struct inode *inode)
+{
+ struct proc_dir_entry *de = PDE(inode);
+ return de->parent->data;
+}
+EXPORT_SYMBOL_GPL(proc_get_parent_data);
+
+void proc_remove(struct proc_dir_entry *de)
+{
+ if (de)
+ remove_proc_subtree(de->name, de->parent);
+}
+EXPORT_SYMBOL(proc_remove);
+
+void *PDE_DATA(const struct inode *inode)
+{
+ return __PDE_DATA(inode);
}
+EXPORT_SYMBOL(PDE_DATA);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab..0adbc02d60e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,28 +7,36 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
+#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/sysctl.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "internal.h"
-static void proc_delete_inode(struct inode *inode)
+static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
+ struct ctl_table_header *head;
+ const struct proc_ns_operations *ns_ops;
+ void *ns;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
/* Stop tracking associated processes */
put_pid(PROC_I(inode)->pid);
@@ -37,13 +45,18 @@ static void proc_delete_inode(struct inode *inode)
de = PROC_I(inode)->pde;
if (de)
pde_put(de);
- if (PROC_I(inode)->sysctl)
- sysctl_head_put(PROC_I(inode)->sysctl);
- clear_inode(inode);
+ head = PROC_I(inode)->sysctl;
+ if (head) {
+ RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+ sysctl_head_put(head);
+ }
+ /* Release any associated namespace */
+ ns_ops = PROC_I(inode)->ns.ns_ops;
+ ns = PROC_I(inode)->ns.ns;
+ if (ns_ops && ns)
+ ns_ops->put(ns);
}
-struct vfsmount *proc_mnt;
-
static struct kmem_cache * proc_inode_cachep;
static struct inode *proc_alloc_inode(struct super_block *sb)
@@ -60,16 +73,24 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ ei->ns.ns = NULL;
+ ei->ns.ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
}
-static void proc_destroy_inode(struct inode *inode)
+static void proc_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
+static void proc_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, proc_i_callback);
+}
+
static void init_once(void *foo)
{
struct proc_inode *ei = (struct proc_inode *) foo;
@@ -86,203 +107,207 @@ void __init proc_init_inodecache(void)
init_once);
}
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct pid_namespace *pid = sb->s_fs_info;
+
+ if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
+ if (pid->hide_pid != 0)
+ seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+ return 0;
+}
+
static const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
- .delete_inode = proc_delete_inode,
+ .evict_inode = proc_evict_inode,
.statfs = simple_statfs,
+ .remount_fs = proc_remount,
+ .show_options = proc_show_options,
};
-static void __pde_users_dec(struct proc_dir_entry *pde)
+enum {BIAS = -1U<<31};
+
+static inline int use_pde(struct proc_dir_entry *pde)
{
- pde->pde_users--;
- if (pde->pde_unload_completion && pde->pde_users == 0)
- complete(pde->pde_unload_completion);
+ return atomic_inc_unless_negative(&pde->in_use);
}
-void pde_users_dec(struct proc_dir_entry *pde)
+static void unuse_pde(struct proc_dir_entry *pde)
{
- spin_lock(&pde->pde_unload_lock);
- __pde_users_dec(pde);
- spin_unlock(&pde->pde_unload_lock);
+ if (atomic_dec_return(&pde->in_use) == BIAS)
+ complete(pde->pde_unload_completion);
}
-static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+/* pde is locked */
+static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- loff_t rv = -EINVAL;
- loff_t (*llseek)(struct file *, loff_t, int);
-
- spin_lock(&pde->pde_unload_lock);
- /*
- * remove_proc_entry() is going to delete PDE (as part of module
- * cleanup sequence). No new callers into module allowed.
- */
- if (!pde->proc_fops) {
+ if (pdeo->closing) {
+ /* somebody else is doing that, just wait */
+ DECLARE_COMPLETION_ONSTACK(c);
+ pdeo->c = &c;
spin_unlock(&pde->pde_unload_lock);
- return rv;
+ wait_for_completion(&c);
+ spin_lock(&pde->pde_unload_lock);
+ } else {
+ struct file *file;
+ pdeo->closing = 1;
+ spin_unlock(&pde->pde_unload_lock);
+ file = pdeo->file;
+ pde->proc_fops->release(file_inode(file), file);
+ spin_lock(&pde->pde_unload_lock);
+ list_del_init(&pdeo->lh);
+ if (pdeo->c)
+ complete(pdeo->c);
+ kfree(pdeo);
}
- /*
- * Bump refcount so that remove_proc_entry will wail for ->llseek to
- * complete.
- */
- pde->pde_users++;
- /*
- * Save function pointer under lock, to protect against ->proc_fops
- * NULL'ifying right after ->pde_unload_lock is dropped.
- */
- llseek = pde->proc_fops->llseek;
- spin_unlock(&pde->pde_unload_lock);
+}
- if (!llseek)
- llseek = default_llseek;
- rv = llseek(file, offset, whence);
+void proc_entry_rundown(struct proc_dir_entry *de)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ /* Wait until all existing callers into module are done. */
+ de->pde_unload_completion = &c;
+ if (atomic_add_return(BIAS, &de->in_use) != BIAS)
+ wait_for_completion(&c);
+
+ spin_lock(&de->pde_unload_lock);
+ while (!list_empty(&de->pde_openers)) {
+ struct pde_opener *pdeo;
+ pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+ close_pdeo(de, pdeo);
+ }
+ spin_unlock(&de->pde_unload_lock);
+}
- pde_users_dec(pde);
+static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ loff_t rv = -EINVAL;
+ if (use_pde(pde)) {
+ loff_t (*llseek)(struct file *, loff_t, int);
+ llseek = pde->proc_fops->llseek;
+ if (!llseek)
+ llseek = default_llseek;
+ rv = llseek(file, offset, whence);
+ unuse_pde(pde);
+ }
return rv;
}
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- ssize_t rv = -EIO;
ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ ssize_t rv = -EIO;
+ if (use_pde(pde)) {
+ read = pde->proc_fops->read;
+ if (read)
+ rv = read(file, buf, count, ppos);
+ unuse_pde(pde);
}
- pde->pde_users++;
- read = pde->proc_fops->read;
- spin_unlock(&pde->pde_unload_lock);
-
- if (read)
- rv = read(file, buf, count, ppos);
-
- pde_users_dec(pde);
return rv;
}
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- ssize_t rv = -EIO;
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ ssize_t rv = -EIO;
+ if (use_pde(pde)) {
+ write = pde->proc_fops->write;
+ if (write)
+ rv = write(file, buf, count, ppos);
+ unuse_pde(pde);
}
- pde->pde_users++;
- write = pde->proc_fops->write;
- spin_unlock(&pde->pde_unload_lock);
-
- if (write)
- rv = write(file, buf, count, ppos);
-
- pde_users_dec(pde);
return rv;
}
static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
unsigned int rv = DEFAULT_POLLMASK;
unsigned int (*poll)(struct file *, struct poll_table_struct *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ poll = pde->proc_fops->poll;
+ if (poll)
+ rv = poll(file, pts);
+ unuse_pde(pde);
}
- pde->pde_users++;
- poll = pde->proc_fops->poll;
- spin_unlock(&pde->pde_unload_lock);
-
- if (poll)
- rv = poll(file, pts);
-
- pde_users_dec(pde);
return rv;
}
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
- long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
- int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
- ioctl = pde->proc_fops->ioctl;
- spin_unlock(&pde->pde_unload_lock);
-
- if (unlocked_ioctl) {
- rv = unlocked_ioctl(file, cmd, arg);
- if (rv == -ENOIOCTLCMD)
- rv = -EINVAL;
- } else if (ioctl) {
- lock_kernel();
- rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
- unlock_kernel();
+ long (*ioctl)(struct file *, unsigned int, unsigned long);
+ if (use_pde(pde)) {
+ ioctl = pde->proc_fops->unlocked_ioctl;
+ if (ioctl)
+ rv = ioctl(file, cmd, arg);
+ unuse_pde(pde);
}
-
- pde_users_dec(pde);
return rv;
}
#ifdef CONFIG_COMPAT
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
long rv = -ENOTTY;
long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ compat_ioctl = pde->proc_fops->compat_ioctl;
+ if (compat_ioctl)
+ rv = compat_ioctl(file, cmd, arg);
+ unuse_pde(pde);
}
- pde->pde_users++;
- compat_ioctl = pde->proc_fops->compat_ioctl;
- spin_unlock(&pde->pde_unload_lock);
-
- if (compat_ioctl)
- rv = compat_ioctl(file, cmd, arg);
-
- pde_users_dec(pde);
return rv;
}
#endif
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(file_inode(file));
int rv = -EIO;
int (*mmap)(struct file *, struct vm_area_struct *);
-
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
- return rv;
+ if (use_pde(pde)) {
+ mmap = pde->proc_fops->mmap;
+ if (mmap)
+ rv = mmap(file, vma);
+ unuse_pde(pde);
}
- pde->pde_users++;
- mmap = pde->proc_fops->mmap;
- spin_unlock(&pde->pde_unload_lock);
+ return rv;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
- if (mmap)
- rv = mmap(file, vma);
+ if (use_pde(pde)) {
+ typeof(proc_reg_get_unmapped_area) *get_area;
- pde_users_dec(pde);
+ get_area = pde->proc_fops->get_unmapped_area;
+#ifdef CONFIG_MMU
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
+#endif
+
+ if (get_area)
+ rv = get_area(file, orig_addr, len, pgoff, flags);
+ else
+ rv = orig_addr;
+ unuse_pde(pde);
+ }
return rv;
}
@@ -304,91 +329,47 @@ static int proc_reg_open(struct inode *inode, struct file *file)
* by hand in remove_proc_entry(). For this, save opener's credentials
* for later.
*/
- pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
+ pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
- spin_lock(&pde->pde_unload_lock);
- if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
+ if (!use_pde(pde)) {
kfree(pdeo);
- return -EINVAL;
+ return -ENOENT;
}
- pde->pde_users++;
open = pde->proc_fops->open;
release = pde->proc_fops->release;
- spin_unlock(&pde->pde_unload_lock);
if (open)
rv = open(inode, file);
- spin_lock(&pde->pde_unload_lock);
if (rv == 0 && release) {
/* To know what to release. */
- pdeo->inode = inode;
pdeo->file = file;
/* Strictly for "too late" ->release in proc_reg_release(). */
- pdeo->release = release;
+ spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
+ spin_unlock(&pde->pde_unload_lock);
} else
kfree(pdeo);
- __pde_users_dec(pde);
- spin_unlock(&pde->pde_unload_lock);
- return rv;
-}
-static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
- struct inode *inode, struct file *file)
-{
- struct pde_opener *pdeo;
-
- list_for_each_entry(pdeo, &pde->pde_openers, lh) {
- if (pdeo->inode == inode && pdeo->file == file)
- return pdeo;
- }
- return NULL;
+ unuse_pde(pde);
+ return rv;
}
static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
- int rv = 0;
- int (*release)(struct inode *, struct file *);
struct pde_opener *pdeo;
-
spin_lock(&pde->pde_unload_lock);
- pdeo = find_pde_opener(pde, inode, file);
- if (!pde->proc_fops) {
- /*
- * Can't simply exit, __fput() will think that everything is OK,
- * and move on to freeing struct file. remove_proc_entry() will
- * find slacker in opener's list and will try to do non-trivial
- * things with struct file. Therefore, remove opener from list.
- *
- * But if opener is removed from list, who will ->release it?
- */
- if (pdeo) {
- list_del(&pdeo->lh);
- spin_unlock(&pde->pde_unload_lock);
- rv = pdeo->release(inode, file);
- kfree(pdeo);
- } else
- spin_unlock(&pde->pde_unload_lock);
- return rv;
- }
- pde->pde_users++;
- release = pde->proc_fops->release;
- if (pdeo) {
- list_del(&pdeo->lh);
- kfree(pdeo);
+ list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+ if (pdeo->file == file) {
+ close_pdeo(pde, pdeo);
+ break;
+ }
}
spin_unlock(&pde->pde_unload_lock);
-
- if (release)
- rv = release(inode, file);
-
- pde_users_dec(pde);
- return rv;
+ return 0;
}
static const struct file_operations proc_reg_file_ops = {
@@ -401,6 +382,7 @@ static const struct file_operations proc_reg_file_ops = {
.compat_ioctl = proc_reg_compat_ioctl,
#endif
.mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
.release = proc_reg_release,
};
@@ -413,22 +395,19 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
.poll = proc_reg_poll,
.unlocked_ioctl = proc_reg_unlocked_ioctl,
.mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
.open = proc_reg_open,
.release = proc_reg_release,
};
#endif
-struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
- struct proc_dir_entry *de)
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode * inode;
+ struct inode *inode = new_inode_pseudo(sb);
- inode = iget_locked(sb, ino);
- if (!inode)
- return NULL;
- if (inode->i_state & I_NEW) {
+ if (inode) {
+ inode->i_ino = de->low_ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- PROC_I(inode)->fd = 0;
PROC_I(inode)->pde = de;
if (de->mode) {
@@ -439,9 +418,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
if (de->size)
inode->i_size = de->size;
if (de->nlink)
- inode->i_nlink = de->nlink;
- if (de->proc_iops)
- inode->i_op = de->proc_iops;
+ set_nlink(inode, de->nlink);
+ WARN_ON(!de->proc_iops);
+ inode->i_op = de->proc_iops;
if (de->proc_fops) {
if (S_ISREG(inode->i_mode)) {
#ifdef CONFIG_COMPAT
@@ -455,15 +434,14 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
inode->i_fop = de->proc_fops;
}
}
- unlock_new_inode(inode);
} else
pde_put(de);
return inode;
-}
+}
int proc_fill_super(struct super_block *s)
{
- struct inode * root_inode;
+ struct inode *root_inode;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
@@ -473,19 +451,17 @@ int proc_fill_super(struct super_block *s)
s->s_time_gran = 1;
pde_get(&proc_root);
- root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
- if (!root_inode)
- goto out_no_root;
- root_inode->i_uid = 0;
- root_inode->i_gid = 0;
- s->s_root = d_alloc_root(root_inode);
- if (!s->s_root)
- goto out_no_root;
- return 0;
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
+
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
-out_no_root:
- printk("proc_read_super: get root inode failed\n");
- iput(root_inode);
- pde_put(&proc_root);
- return -ENOMEM;
+ return proc_setup_self(s);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd1..3ab6d14e71c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -1,4 +1,4 @@
-/* internal.h: internal procfs definitions
+/* Internal procfs definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -10,58 +10,82 @@
*/
#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/binfmts.h>
-extern struct proc_dir_entry proc_root;
-#ifdef CONFIG_PROC_SYSCTL
-extern int proc_sys_init(void);
-#else
-static inline void proc_sys_init(void) { }
-#endif
-#ifdef CONFIG_NET
-extern int proc_net_init(void);
-#else
-static inline int proc_net_init(void) { return 0; }
-#endif
+struct ctl_table_header;
+struct mempolicy;
-struct vmalloc_info {
- unsigned long used;
- unsigned long largest_chunk;
+/*
+ * This is not completely implemented yet. The idea is to
+ * create an in-memory tree (like the actual /proc filesystem
+ * tree) of these proc_dir_entries, so that we can dynamically
+ * add new files to /proc.
+ *
+ * The "next" pointer creates a linked list of one /proc directory,
+ * while parent/subdir create the directory structure (every
+ * /proc file has a parent, but "subdir" is NULL for all
+ * non-directory entries).
+ */
+struct proc_dir_entry {
+ unsigned int low_ino;
+ umode_t mode;
+ nlink_t nlink;
+ kuid_t uid;
+ kgid_t gid;
+ loff_t size;
+ const struct inode_operations *proc_iops;
+ const struct file_operations *proc_fops;
+ struct proc_dir_entry *next, *parent, *subdir;
+ void *data;
+ atomic_t count; /* use count */
+ atomic_t in_use; /* number of callers into module in progress; */
+ /* negative -> it's going away RSN */
+ struct completion *pde_unload_completion;
+ struct list_head pde_openers; /* who did ->open, but not ->release */
+ spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+ u8 namelen;
+ char name[];
};
-extern struct mm_struct *mm_for_maps(struct task_struct *);
+union proc_op {
+ int (*proc_get_link)(struct dentry *, struct path *);
+ int (*proc_read)(struct task_struct *task, char *page);
+ int (*proc_show)(struct seq_file *m,
+ struct pid_namespace *ns, struct pid *pid,
+ struct task_struct *task);
+};
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-extern void get_vmalloc_info(struct vmalloc_info *vmi);
-#else
+struct proc_inode {
+ struct pid *pid;
+ int fd;
+ union proc_op op;
+ struct proc_dir_entry *pde;
+ struct ctl_table_header *sysctl;
+ struct ctl_table *sysctl_entry;
+ struct proc_ns ns;
+ struct inode vfs_inode;
+};
-#define VMALLOC_TOTAL 0UL
-#define get_vmalloc_info(vmi) \
-do { \
- (vmi)->used = 0; \
- (vmi)->largest_chunk = 0; \
-} while(0)
-#endif
+/*
+ * General functions
+ */
+static inline struct proc_inode *PROC_I(const struct inode *inode)
+{
+ return container_of(inode, struct proc_inode, vfs_inode);
+}
-extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task);
-extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
-
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
-extern const struct file_operations proc_clear_refs_operations;
-extern const struct file_operations proc_pagemap_operations;
-extern const struct file_operations proc_net_operations;
-extern const struct inode_operations proc_net_inode_operations;
+static inline struct proc_dir_entry *PDE(const struct inode *inode)
+{
+ return PROC_I(inode)->pde;
+}
-void proc_init_inodecache(void);
+static inline void *__PDE_DATA(const struct inode *inode)
+{
+ return PDE(inode)->data;
+}
static inline struct pid *proc_pid(struct inode *inode)
{
@@ -73,49 +97,195 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
-static inline int proc_fd(struct inode *inode)
+static inline int task_dumpable(struct task_struct *task)
{
- return PROC_I(inode)->fd;
+ int dumpable = 0;
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm)
+ dumpable = get_dumpable(mm);
+ task_unlock(task);
+ if (dumpable == SUID_DUMP_USER)
+ return 1;
+ return 0;
}
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
- struct dentry *dentry);
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir);
+static inline unsigned name_to_int(struct dentry *dentry)
+{
+ const char *name = dentry->d_name.name;
+ int len = dentry->d_name.len;
+ unsigned n = 0;
-struct pde_opener {
- struct inode *inode;
- struct file *file;
- int (*release)(struct inode *, struct file *);
- struct list_head lh;
-};
-void pde_users_dec(struct proc_dir_entry *pde);
+ if (len > 1 && *name == '0')
+ goto out;
+ while (len-- > 0) {
+ unsigned c = *name++ - '0';
+ if (c > 9)
+ goto out;
+ if (n >= (~0U-9)/10)
+ goto out;
+ n *= 10;
+ n += c;
+ }
+ return n;
+out:
+ return ~0U;
+}
+
+/*
+ * Offset of the first process in the /proc root directory..
+ */
+#define FIRST_PROCESS_ENTRY 256
+
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
+
+/*
+ * array.c
+ */
+extern const struct file_operations proc_tid_children_operations;
+
+extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+/*
+ * base.c
+ */
+extern const struct dentry_operations pid_dentry_operations;
+extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int proc_setattr(struct dentry *, struct iattr *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern int pid_revalidate(struct dentry *, unsigned int);
+extern int pid_delete_dentry(const struct dentry *);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
+extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+extern loff_t mem_lseek(struct file *, loff_t, int);
+
+/* Lookups */
+typedef int instantiate_t(struct inode *, struct dentry *,
+ struct task_struct *, const void *);
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+ instantiate_t, struct task_struct *, const void *);
+
+/*
+ * generic.c
+ */
extern spinlock_t proc_subdir_lock;
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
-unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
-void task_mem(struct seq_file *, struct mm_struct *);
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
+ struct dentry *);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
{
atomic_inc(&pde->count);
return pde;
}
-void pde_put(struct proc_dir_entry *pde);
+extern void pde_put(struct proc_dir_entry *);
-extern struct vfsmount *proc_mnt;
-int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+/*
+ * inode.c
+ */
+struct pde_opener {
+ struct file *file;
+ struct list_head lh;
+ int closing;
+ struct completion *c;
+};
+
+extern const struct inode_operations proc_pid_link_inode_operations;
+
+extern void proc_init_inodecache(void);
+extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+extern int proc_fill_super(struct super_block *);
+extern void proc_entry_rundown(struct proc_dir_entry *);
/*
- * These are generic /proc routines that use the internal
- * "struct proc_dir_entry" tree to traverse the filesystem.
- *
- * The /proc root directory has extended versions to take care
- * of the /proc/<pid> subdirectories.
+ * proc_namespaces.c
+ */
+extern const struct inode_operations proc_ns_dir_inode_operations;
+extern const struct file_operations proc_ns_dir_operations;
+
+/*
+ * proc_net.c
+ */
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
+
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
+
+/*
+ * proc_self.c
+ */
+extern int proc_setup_self(struct super_block *);
+
+/*
+ * proc_sysctl.c
*/
-int proc_readdir(struct file *, void *, filldir_t);
-struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *);
+#else
+static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
+#endif
+
+/*
+ * proc_tty.c
+ */
+#ifdef CONFIG_TTY
+extern void proc_tty_init(void);
+#else
+static inline void proc_tty_init(void) {}
+#endif
+
+/*
+ * root.c
+ */
+extern struct proc_dir_entry proc_root;
+
+extern void proc_self_init(void);
+extern int proc_remount(struct super_block *, int *, char *);
+
+/*
+ * task_[no]mmu.c
+ */
+struct proc_maps_private {
+ struct pid *pid;
+ struct task_struct *task;
+#ifdef CONFIG_MMU
+ struct vm_area_struct *tail_vma;
+#endif
+#ifdef CONFIG_NUMA
+ struct mempolicy *task_mempolicy;
+#endif
+};
+
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
+extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
+
+extern unsigned long task_vsize(struct mm_struct *);
+extern unsigned long task_statm(struct mm_struct *,
+ unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void task_mem(struct seq_file *, struct mm_struct *);
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 05029c0e2f2..a352d5703b4 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void)
proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
return 0;
}
-module_init(proc_interrupts_init);
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4..39e6ef32f0b 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -11,20 +11,25 @@
#include <linux/mm.h>
#include <linux/proc_fs.h>
+#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
+#include <linux/printk.h>
#include <linux/bootmem.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
#include <linux/memory.h>
#include <asm/sections.h>
+#include "internal.h"
#define CORE_STR "CORE"
@@ -156,7 +161,8 @@ static int kcore_update_ram(void)
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* calculate vmemmap's address from given system ram pfn and register it */
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -188,7 +194,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
}
#else
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
return 1;
}
@@ -246,10 +253,9 @@ static int kcore_update_ram(void)
/* Not inialized....update now */
/* find out "max pfn" */
end_pfn = 0;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long node_end;
- node_end = NODE_DATA(nid)->node_start_pfn +
- NODE_DATA(nid)->node_spanned_pages;
+ node_end = node_end_pfn(nid);
if (end_pfn < node_end)
end_pfn = node_end;
}
@@ -401,7 +407,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
prpsinfo.pr_zomb = 0;
strcpy(prpsinfo.pr_fname, "vmlinux");
- strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+ strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
nhdr->p_filesz += notesize(&notes[1]);
bufp = storenote(&notes[1], bufp);
@@ -490,7 +496,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
read_unlock(&kclist_lock);
- if (m == NULL) {
+ if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
return -EFAULT;
} else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -512,7 +518,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
n = copy_to_user(buffer, (char *)start, tsz);
/*
- * We cannot distingush between fault on source
+ * We cannot distinguish between fault on source
* and fault on destination. When this happens
* we clear too and hope it will trigger the
* EFAULT again.
@@ -557,9 +563,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
static const struct file_operations proc_kcore_operations = {
.read = read_kcore,
.open = open_kcore,
+ .llseek = default_llseek,
};
-#ifdef CONFIG_MEMORY_HOTPLUG
/* just remember that we have to update kcore */
static int __meminit kcore_callback(struct notifier_block *self,
unsigned long action, void *arg)
@@ -573,8 +579,11 @@ static int __meminit kcore_callback(struct notifier_block *self,
}
return NOTIFY_OK;
}
-#endif
+static struct notifier_block kcore_callback_nb __meminitdata = {
+ .notifier_call = kcore_callback,
+ .priority = 0,
+};
static struct kcore_list kcore_vmalloc;
@@ -586,7 +595,7 @@ static struct kcore_list kcore_text;
*/
static void __init proc_kcore_text_init(void)
{
- kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+ kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
}
#else
static void __init proc_kcore_text_init(void)
@@ -615,7 +624,7 @@ static int __init proc_kcore_init(void)
proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
&proc_kcore_operations);
if (!proc_root_kcore) {
- printk(KERN_ERR "couldn't create /proc/kcore\n");
+ pr_err("couldn't create /proc/kcore\n");
return 0; /* Always returns 0. */
}
/* Store text area if it's special */
@@ -626,8 +635,8 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- hotplug_memory_notifier(kcore_callback, 0);
+ register_hotmemory_notifier(&kcore_callback_nb);
return 0;
}
-module_init(proc_kcore_init);
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f..05f8dcdb086 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/fs.h>
+#include <linux/syslog.h>
#include <asm/uaccess.h>
#include <asm/io.h>
extern wait_queue_head_t log_wait;
-extern int do_syslog(int type, char __user *bug, int count);
-
static int kmsg_open(struct inode * inode, struct file * file)
{
- return do_syslog(1,NULL,0);
+ return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
}
static int kmsg_release(struct inode * inode, struct file * file)
{
- (void) do_syslog(0,NULL,0);
+ (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
return 0;
}
static ssize_t kmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0))
+ if ((file->f_flags & O_NONBLOCK) &&
+ !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return -EAGAIN;
- return do_syslog(2, buf, count);
+ return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
}
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &log_wait, wait);
- if (do_syslog(9, NULL, 0))
+ if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return POLLIN | POLLRDNORM;
return 0;
}
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
.poll = kmsg_poll,
.open = kmsg_open,
.release = kmsg_release,
+ .llseek = generic_file_llseek,
};
static int __init proc_kmsg_init(void)
@@ -60,4 +61,4 @@ static int __init proc_kmsg_init(void)
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
return 0;
}
-module_init(proc_kmsg_init);
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd4cae..aec66e6c206 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void)
proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
return 0;
}
-module_init(proc_loadavg_init);
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97..7445af0b1aa 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -1,8 +1,8 @@
#include <linux/fs.h>
-#include <linux/hugetlb.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
@@ -10,7 +10,8 @@
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -23,10 +24,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- unsigned long allowed;
struct vmalloc_info vmi;
long cached;
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
int lru;
/*
@@ -36,11 +40,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
- allowed = ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
cached = global_page_state(NR_FILE_PAGES) -
- total_swapcache_pages - i.bufferram;
+ total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
@@ -49,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ *
+ * Free memory cannot be taken below the low watermark, before the
+ * system starts swapping.
+ */
+ available = i.freeram - wmark_low;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemAvailable: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -101,12 +135,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %5lu kB\n"
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "AnonHugePages: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
+ K(available),
K(i.bufferram),
K(cached),
- K(total_swapcache_pages),
+ K(total_swapcache_pages()),
K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
K(pages[LRU_ACTIVE_ANON]),
@@ -143,13 +181,17 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
K(global_page_state(NR_WRITEBACK_TEMP)),
- K(allowed),
+ K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
vmi.largest_chunk >> 10
#ifdef CONFIG_MEMORY_FAILURE
- ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+ ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR)
#endif
);
@@ -178,4 +220,4 @@ static int __init proc_meminfo_init(void)
proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
return 0;
}
-module_init(proc_meminfo_init);
+fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c
deleted file mode 100644
index 8ae221dfd01..00000000000
--- a/fs/proc/mmu.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mmu.c: mmu memory info files
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-#include "internal.h"
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
- struct vm_struct *vma;
- unsigned long free_area_size;
- unsigned long prev_end;
-
- vmi->used = 0;
-
- if (!vmlist) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- }
- else {
- vmi->largest_chunk = 0;
-
- prev_end = VMALLOC_START;
-
- read_lock(&vmlist_lock);
-
- for (vma = vmlist; vma; vma = vma->next) {
- unsigned long addr = (unsigned long) vma->addr;
-
- /*
- * Some archs keep another range for modules in vmlist
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
-
- vmi->used += vma->size;
-
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
-
- prev_end = vma->size + addr;
- }
-
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
- read_unlock(&vmlist_lock);
- }
-}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 00000000000..89026095f2b
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,297 @@
+#include <linux/proc_fs.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include "internal.h"
+
+
+static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+ &netns_operations,
+#endif
+#ifdef CONFIG_UTS_NS
+ &utsns_operations,
+#endif
+#ifdef CONFIG_IPC_NS
+ &ipcns_operations,
+#endif
+#ifdef CONFIG_PID_NS
+ &pidns_operations,
+#endif
+#ifdef CONFIG_USER_NS
+ &userns_operations,
+#endif
+ &mntns_operations,
+};
+
+static const struct file_operations ns_file_operations = {
+ .llseek = no_llseek,
+};
+
+static const struct inode_operations ns_inode_operations = {
+ .setattr = proc_setattr,
+};
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ struct inode *inode = dentry->d_inode;
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
+
+ return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ ns_ops->name, inode->i_ino);
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+ .d_delete = always_delete_dentry,
+ .d_dname = ns_dname,
+};
+
+static struct dentry *proc_ns_get_dentry(struct super_block *sb,
+ struct task_struct *task, const struct proc_ns_operations *ns_ops)
+{
+ struct dentry *dentry, *result;
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct qstr qname = { .name = "", };
+ void *ns;
+
+ ns = ns_ops->get(task);
+ if (!ns)
+ return ERR_PTR(-ENOENT);
+
+ dentry = d_alloc_pseudo(sb, &qname);
+ if (!dentry) {
+ ns_ops->put(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ inode = iget_locked(sb, ns_ops->inum(ns));
+ if (!inode) {
+ dput(dentry);
+ ns_ops->put(ns);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ei = PROC_I(inode);
+ if (inode->i_state & I_NEW) {
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_op = &ns_inode_operations;
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_fop = &ns_file_operations;
+ ei->ns.ns_ops = ns_ops;
+ ei->ns.ns = ns;
+ unlock_new_inode(inode);
+ } else {
+ ns_ops->put(ns);
+ }
+
+ d_set_d_op(dentry, &ns_dentry_operations);
+ result = d_instantiate_unique(dentry, inode);
+ if (result) {
+ dput(dentry);
+ dentry = result;
+ }
+
+ return dentry;
+}
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct proc_inode *ei = PROC_I(inode);
+ struct task_struct *task;
+ struct path ns_path;
+ void *error = ERR_PTR(-EACCES);
+
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
+ if (IS_ERR(ns_path.dentry)) {
+ error = ERR_CAST(ns_path.dentry);
+ goto out_put_task;
+ }
+
+ ns_path.mnt = mntget(nd->path.mnt);
+ nd_jump_link(nd, &ns_path);
+ error = NULL;
+
+out_put_task:
+ put_task_struct(task);
+out:
+ return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+ struct inode *inode = dentry->d_inode;
+ struct proc_inode *ei = PROC_I(inode);
+ const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
+ struct task_struct *task;
+ void *ns;
+ char name[50];
+ int res = -EACCES;
+
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ res = -ENOENT;
+ ns = ns_ops->get(task);
+ if (!ns)
+ goto out_put_task;
+
+ snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
+ res = readlink_copy(buffer, buflen, name);
+ ns_ops->put(ns);
+out_put_task:
+ put_task_struct(task);
+out:
+ return res;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+ .readlink = proc_ns_readlink,
+ .follow_link = proc_ns_follow_link,
+ .setattr = proc_setattr,
+};
+
+static int proc_ns_instantiate(struct inode *dir,
+ struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+ const struct proc_ns_operations *ns_ops = ptr;
+ struct inode *inode;
+ struct proc_inode *ei;
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ goto out;
+
+ ei = PROC_I(inode);
+ inode->i_mode = S_IFLNK|S_IRWXUGO;
+ inode->i_op = &proc_ns_link_inode_operations;
+ ei->ns.ns_ops = ns_ops;
+
+ d_set_d_op(dentry, &pid_dentry_operations);
+ d_add(dentry, inode);
+ /* Close the race of the process dying before we return the dentry */
+ if (pid_revalidate(dentry, 0))
+ return 0;
+out:
+ return -ENOENT;
+}
+
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct proc_ns_operations **entry, **last;
+
+ if (!task)
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+ goto out;
+ entry = ns_entries + (ctx->pos - 2);
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ while (entry <= last) {
+ const struct proc_ns_operations *ops = *entry;
+ if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+ proc_ns_instantiate, task, ops))
+ break;
+ ctx->pos++;
+ entry++;
+ }
+out:
+ put_task_struct(task);
+ return 0;
+}
+
+const struct file_operations proc_ns_dir_operations = {
+ .read = generic_read_dir,
+ .iterate = proc_ns_dir_readdir,
+};
+
+static struct dentry *proc_ns_dir_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
+{
+ int error;
+ struct task_struct *task = get_proc_task(dir);
+ const struct proc_ns_operations **entry, **last;
+ unsigned int len = dentry->d_name.len;
+
+ error = -ENOENT;
+
+ if (!task)
+ goto out_no_task;
+
+ last = &ns_entries[ARRAY_SIZE(ns_entries)];
+ for (entry = ns_entries; entry < last; entry++) {
+ if (strlen((*entry)->name) != len)
+ continue;
+ if (!memcmp(dentry->d_name.name, (*entry)->name, len))
+ break;
+ }
+ if (entry == last)
+ goto out;
+
+ error = proc_ns_instantiate(dir, dentry, task, *entry);
+out:
+ put_task_struct(task);
+out_no_task:
+ return ERR_PTR(error);
+}
+
+const struct inode_operations proc_ns_dir_inode_operations = {
+ .lookup = proc_ns_dir_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
+
+struct file *proc_ns_fget(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ return file;
+
+out_invalid:
+ fput(file);
+ return ERR_PTR(-EINVAL);
+}
+
+struct proc_ns *get_proc_ns(struct inode *inode)
+{
+ return &PROC_I(inode)->ns;
+}
+
+bool proc_ns_inode(struct inode *inode)
+{
+ return inode->i_fop == &ns_file_operations;
+}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe11..d4a35746cab 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
@@ -40,19 +39,20 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
- int flags, len;
+ int flags;
flags = region->vm_flags;
file = region->vm_file;
if (file) {
- struct inode *inode = region->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(region->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
}
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
region->vm_start,
region->vm_end,
flags & VM_READ ? 'r' : '-',
@@ -60,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
((loff_t)region->vm_pgoff) << PAGE_SHIFT,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
if (file) {
- len = 25 + sizeof(void *) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
}
@@ -134,4 +131,4 @@ static int __init proc_nommu_init(void)
return 0;
}
-module_init(proc_nommu_init);
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd6..e647c55275d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
ppage = pfn_to_page(pfn);
else
ppage = NULL;
- if (!ppage)
+ if (!ppage || PageSlab(ppage))
pcount = 0;
else
pcount = page_mapcount(ppage);
@@ -115,16 +115,27 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
-
- u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+ /*
+ * PageTransCompound can be true for non-huge compound pages (slab
+ * pages or pages allocated by drivers with __GFP_COMP) because it
+ * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * to make sure a given page is a thp, not a non-huge compound page.
+ */
+ else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+ PageAnon(compound_head(page))))
+ u |= 1 << KPF_THP;
/*
- * Caveats on high order pages:
- * PG_buddy will only be set on the head page; SLUB/SLQB do the same
- * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+ * Caveats on high order pages: page->_count will only be set
+ * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+ * SLOB won't set PG_slab at all on compound pages.
*/
+ if (PageBuddy(page))
+ u |= 1 << KPF_BUDDY;
+
+ u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -146,7 +157,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
#endif
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
#endif
@@ -207,4 +218,4 @@ static int __init proc_page_init(void)
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
return 0;
}
-module_init(proc_page_init);
+fs_initcall(proc_page_init);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index 123257bb356..00000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * proc_devtree.c - handles /proc/device-tree
- *
- * Copyright 1997 Paul Mackerras
- */
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <asm/prom.h>
-#include <asm/uaccess.h>
-#include "internal.h"
-
-#ifndef HAVE_ARCH_DEVTREE_FIXUPS
-static inline void set_node_proc_entry(struct device_node *np,
- struct proc_dir_entry *de)
-{
-}
-#endif
-
-static struct proc_dir_entry *proc_device_tree;
-
-/*
- * Supply data on a read from /proc/device-tree/node/property.
- */
-static int property_proc_show(struct seq_file *m, void *v)
-{
- struct property *pp = m->private;
-
- seq_write(m, pp->value, pp->length);
- return 0;
-}
-
-static int property_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, property_proc_show, PDE(inode)->data);
-}
-
-static const struct file_operations property_proc_fops = {
- .owner = THIS_MODULE,
- .open = property_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/*
- * For a node with a name like "gc@10", we make symlinks called "gc"
- * and "@10" to it.
- */
-
-/*
- * Add a property to a node
- */
-static struct proc_dir_entry *
-__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
- const char *name)
-{
- struct proc_dir_entry *ent;
-
- /*
- * Unfortunately proc_register puts each new entry
- * at the beginning of the list. So we rearrange them.
- */
- ent = proc_create_data(name,
- strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
- de, &property_proc_fops, pp);
- if (ent == NULL)
- return NULL;
-
- if (!strncmp(name, "security-", 9))
- ent->size = 0; /* don't leak number of password chars */
- else
- ent->size = pp->length;
-
- return ent;
-}
-
-
-void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
-{
- __proc_device_tree_add_prop(pde, prop, prop->name);
-}
-
-void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
- struct property *prop)
-{
- remove_proc_entry(prop->name, pde);
-}
-
-void proc_device_tree_update_prop(struct proc_dir_entry *pde,
- struct property *newprop,
- struct property *oldprop)
-{
- struct proc_dir_entry *ent;
-
- for (ent = pde->subdir; ent != NULL; ent = ent->next)
- if (ent->data == oldprop)
- break;
- if (ent == NULL) {
- printk(KERN_WARNING "device-tree: property \"%s\" "
- " does not exist\n", oldprop->name);
- } else {
- ent->data = newprop;
- ent->size = newprop->length;
- }
-}
-
-/*
- * Various dodgy firmware might give us nodes and/or properties with
- * conflicting names. That's generally ok, except for exporting via /proc,
- * so munge names here to ensure they're unique.
- */
-
-static int duplicate_name(struct proc_dir_entry *de, const char *name)
-{
- struct proc_dir_entry *ent;
- int found = 0;
-
- spin_lock(&proc_subdir_lock);
-
- for (ent = de->subdir; ent != NULL; ent = ent->next) {
- if (strcmp(ent->name, name) == 0) {
- found = 1;
- break;
- }
- }
-
- spin_unlock(&proc_subdir_lock);
-
- return found;
-}
-
-static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
- const char *name)
-{
- char *fixed_name;
- int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
- int i = 1, size;
-
-realloc:
- fixed_name = kmalloc(fixup_len, GFP_KERNEL);
- if (fixed_name == NULL) {
- printk(KERN_ERR "device-tree: Out of memory trying to fixup "
- "name \"%s\"\n", name);
- return name;
- }
-
-retry:
- size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
- size++; /* account for NULL */
-
- if (size > fixup_len) {
- /* We ran out of space, free and reallocate. */
- kfree(fixed_name);
- fixup_len = size;
- goto realloc;
- }
-
- if (duplicate_name(de, fixed_name)) {
- /* Multiple duplicates. Retry with a different offset. */
- i++;
- goto retry;
- }
-
- printk(KERN_WARNING "device-tree: Duplicate name in %s, "
- "renamed to \"%s\"\n", np->full_name, fixed_name);
-
- return fixed_name;
-}
-
-/*
- * Process a node, adding entries for its children and its properties.
- */
-void proc_device_tree_add_node(struct device_node *np,
- struct proc_dir_entry *de)
-{
- struct property *pp;
- struct proc_dir_entry *ent;
- struct device_node *child;
- const char *p;
-
- set_node_proc_entry(np, de);
- for (child = NULL; (child = of_get_next_child(np, child));) {
- /* Use everything after the last slash, or the full name */
- p = strrchr(child->full_name, '/');
- if (!p)
- p = child->full_name;
- else
- ++p;
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = proc_mkdir(p, de);
- if (ent == NULL)
- break;
- proc_device_tree_add_node(child, ent);
- }
- of_node_put(child);
-
- for (pp = np->properties; pp != NULL; pp = pp->next) {
- p = pp->name;
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = __proc_device_tree_add_prop(de, pp, p);
- if (ent == NULL)
- break;
- }
-}
-
-/*
- * Called on initialization to set up the /proc/device-tree subtree
- */
-void __init proc_device_tree_init(void)
-{
- struct device_node *root;
-
- proc_device_tree = proc_mkdir("device-tree", NULL);
- if (proc_device_tree == NULL)
- return;
- root = of_find_node_by_path("/");
- if (root == NULL) {
- printk(KERN_ERR "/proc/device-tree: can't find root\n");
- return;
- }
- proc_device_tree_add_node(root, proc_device_tree);
- of_node_put(root);
-}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c3..4677bb7dc7c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/module.h>
@@ -25,6 +26,10 @@
#include "internal.h"
+static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+{
+ return pde->parent->data;
+}
static struct net *get_proc_net(const struct inode *inode)
{
@@ -118,7 +123,7 @@ static struct net *get_proc_task_net(struct inode *dir)
}
static struct dentry *proc_tgid_net_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd)
+ struct dentry *dentry, unsigned int flags)
{
struct dentry *de;
struct net *net;
@@ -155,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
.getattr = proc_tgid_net_getattr,
};
-static int proc_tgid_net_readdir(struct file *filp, void *dirent,
- filldir_t filldir)
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
{
int ret;
struct net *net;
ret = -EINVAL;
- net = get_proc_task_net(filp->f_path.dentry->d_inode);
+ net = get_proc_task_net(file_inode(file));
if (net != NULL) {
- ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+ ret = proc_readdir_de(net->proc_net, file, ctx);
put_net(net);
}
return ret;
@@ -173,38 +177,24 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
const struct file_operations proc_net_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = proc_tgid_net_readdir,
+ .iterate = proc_tgid_net_readdir,
};
-
-struct proc_dir_entry *proc_net_fops_create(struct net *net,
- const char *name, mode_t mode, const struct file_operations *fops)
-{
- return proc_create(name, mode, net->proc_net, fops);
-}
-EXPORT_SYMBOL_GPL(proc_net_fops_create);
-
-void proc_net_remove(struct net *net, const char *name)
-{
- remove_proc_entry(name, net->proc_net);
-}
-EXPORT_SYMBOL_GPL(proc_net_remove);
-
static __net_init int proc_net_ns_init(struct net *net)
{
struct proc_dir_entry *netd, *net_statd;
int err;
err = -ENOMEM;
- netd = kzalloc(sizeof(*netd), GFP_KERNEL);
+ netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
if (!netd)
goto out;
netd->data = net;
netd->nlink = 2;
- netd->name = "net";
netd->namelen = 3;
netd->parent = &proc_root;
+ memcpy(netd->name, "net", 4);
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a1..71290463a1d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,8 +3,14 @@
*/
#include <linux/init.h>
#include <linux/sysctl.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
+#include <linux/printk.h>
#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -13,6 +19,379 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+ if (!poll)
+ return;
+
+ atomic_inc(&poll->event);
+ wake_up_interruptible(&poll->wait);
+}
+
+static struct ctl_table root_table[] = {
+ {
+ .procname = "",
+ .mode = S_IFDIR|S_IRUGO|S_IXUGO,
+ },
+ { }
+};
+static struct ctl_table_root sysctl_table_root = {
+ .default_set.dir.header = {
+ {{.count = 1,
+ .nreg = 1,
+ .ctl_table = root_table }},
+ .ctl_table_arg = root_table,
+ .root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
+ },
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+static void drop_sysctl_table(struct ctl_table_header *header);
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces);
+static int insert_links(struct ctl_table_header *head);
+static void put_links(struct ctl_table_header *header);
+
+static void sysctl_print_dir(struct ctl_dir *dir)
+{
+ if (dir->header.parent)
+ sysctl_print_dir(dir->header.parent);
+ pr_cont("%s/", dir->header.ctl_table[0].procname);
+}
+
+static int namecmp(const char *name1, int len1, const char *name2, int len2)
+{
+ int minlen;
+ int cmp;
+
+ minlen = len1;
+ if (minlen > len2)
+ minlen = len2;
+
+ cmp = memcmp(name1, name2, minlen);
+ if (cmp == 0)
+ cmp = len1 - len2;
+ return cmp;
+}
+
+/* Called under sysctl_lock */
+static struct ctl_table *find_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir, const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+ struct rb_node *node = dir->root.rb_node;
+
+ while (node)
+ {
+ struct ctl_node *ctl_node;
+ const char *procname;
+ int cmp;
+
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ procname = entry->procname;
+
+ cmp = namecmp(name, namelen, procname, strlen(procname));
+ if (cmp < 0)
+ node = node->rb_left;
+ else if (cmp > 0)
+ node = node->rb_right;
+ else {
+ *phead = head;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+ struct rb_node **p = &head->parent->root.rb_node;
+ struct rb_node *parent = NULL;
+ const char *name = entry->procname;
+ int namelen = strlen(name);
+
+ while (*p) {
+ struct ctl_table_header *parent_head;
+ struct ctl_table *parent_entry;
+ struct ctl_node *parent_node;
+ const char *parent_name;
+ int cmp;
+
+ parent = *p;
+ parent_node = rb_entry(parent, struct ctl_node, node);
+ parent_head = parent_node->header;
+ parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
+ parent_name = parent_entry->procname;
+
+ cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
+ pr_err("sysctl duplicate entry: ");
+ sysctl_print_dir(head->parent);
+ pr_cont("/%s\n", entry->procname);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, &head->parent->root);
+ return 0;
+}
+
+static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+
+ rb_erase(node, &head->parent->root);
+}
+
+static void init_header(struct ctl_table_header *head,
+ struct ctl_table_root *root, struct ctl_table_set *set,
+ struct ctl_node *node, struct ctl_table *table)
+{
+ head->ctl_table = table;
+ head->ctl_table_arg = table;
+ head->used = 0;
+ head->count = 1;
+ head->nreg = 1;
+ head->unregistering = NULL;
+ head->root = root;
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
+ if (node) {
+ struct ctl_table *entry;
+ for (entry = table; entry->procname; entry++, node++)
+ node->header = head;
+ }
+}
+
+static void erase_header(struct ctl_table_header *head)
+{
+ struct ctl_table *entry;
+ for (entry = head->ctl_table; entry->procname; entry++)
+ erase_entry(head, entry);
+}
+
+static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
+{
+ struct ctl_table *entry;
+ int err;
+
+ dir->header.nreg++;
+ header->parent = dir;
+ err = insert_links(header);
+ if (err)
+ goto fail_links;
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ err = insert_entry(header, entry);
+ if (err)
+ goto fail;
+ }
+ return 0;
+fail:
+ erase_header(header);
+ put_links(header);
+fail_links:
+ header->parent = NULL;
+ drop_sysctl_table(&dir->header);
+ return err;
+}
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+ if (unlikely(p->unregistering))
+ return 0;
+ p->used++;
+ return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+ if (!--p->used)
+ if (unlikely(p->unregistering))
+ complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+ /*
+ * if p->used is 0, nobody will ever touch that entry again;
+ * we'll eliminate all paths to it before dropping sysctl_lock
+ */
+ if (unlikely(p->used)) {
+ struct completion wait;
+ init_completion(&wait);
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+ spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
+ }
+ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ erase_header(p);
+}
+
+static void sysctl_head_get(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ BUG_ON(!head);
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root, namespaces);
+ return set;
+}
+
+static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ spin_lock(&sysctl_lock);
+ entry = find_entry(&head, dir, name, namelen);
+ if (entry && use_table(head))
+ *phead = head;
+ else
+ entry = NULL;
+ spin_unlock(&sysctl_lock);
+ return entry;
+}
+
+static struct ctl_node *first_usable_entry(struct rb_node *node)
+{
+ struct ctl_node *ctl_node;
+
+ for (;node; node = rb_next(node)) {
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ if (use_table(ctl_node->header))
+ return ctl_node;
+ }
+ return NULL;
+}
+
+static void first_entry(struct ctl_dir *dir,
+ struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = NULL;
+ struct ctl_table *entry = NULL;
+ struct ctl_node *ctl_node;
+
+ spin_lock(&sysctl_lock);
+ ctl_node = first_usable_entry(rb_first(&dir->root));
+ spin_unlock(&sysctl_lock);
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = *phead;
+ struct ctl_table *entry = *pentry;
+ struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
+
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+
+ ctl_node = first_usable_entry(rb_next(&ctl_node->node));
+ spin_unlock(&sysctl_lock);
+ head = NULL;
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
+ mode >>= 6;
+ else if (in_egroup_p(GLOBAL_ROOT_GID))
+ mode >>= 3;
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+{
+ struct ctl_table_root *root = head->root;
+ int mode;
+
+ if (root->permissions)
+ mode = root->permissions(head, table);
+ else
+ mode = table->mode;
+
+ return test_perm(mode, op);
+}
+
static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
{
@@ -23,21 +402,21 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
if (!inode)
goto out;
+ inode->i_ino = get_next_ino();
+
sysctl_head_get(head);
ei = PROC_I(inode);
ei->sysctl = head;
ei->sysctl_entry = table;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
inode->i_mode = table->mode;
- if (!table->child) {
+ if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
inode->i_op = &proc_sys_inode_operations;
inode->i_fop = &proc_sys_file_operations;
} else {
inode->i_mode |= S_IFDIR;
- inode->i_nlink = 0;
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
}
@@ -45,83 +424,54 @@ out:
return inode;
}
-static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
-{
- int len;
- for ( ; p->procname; p++) {
-
- if (!p->procname)
- continue;
-
- len = strlen(p->procname);
- if (len != name->len)
- continue;
-
- if (memcmp(p->procname, name->name, len) != 0)
- continue;
-
- /* I have a match */
- return p;
- }
- return NULL;
-}
-
static struct ctl_table_header *grab_header(struct inode *inode)
{
- if (PROC_I(inode)->sysctl)
- return sysctl_head_grab(PROC_I(inode)->sysctl);
- else
- return sysctl_head_next(NULL);
+ struct ctl_table_header *head = PROC_I(inode)->sysctl;
+ if (!head)
+ head = &sysctl_table_root.default_set.dir.header;
+ return sysctl_head_grab(head);
}
static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
struct ctl_table_header *head = grab_header(dir);
- struct ctl_table *table = PROC_I(dir)->sysctl_entry;
struct ctl_table_header *h = NULL;
struct qstr *name = &dentry->d_name;
struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
+ struct ctl_dir *ctl_dir;
+ int ret;
if (IS_ERR(head))
return ERR_CAST(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
-
- table = table ? table->child : head->ctl_table;
-
- p = find_in_table(table, name);
- if (!p) {
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- p = find_in_table(h->attached_by, name);
- if (p)
- break;
- }
- }
+ ctl_dir = container_of(head, struct ctl_dir, header);
+ p = lookup_entry(&h, ctl_dir, name->name, name->len);
if (!p)
goto out;
+ if (S_ISLNK(p->mode)) {
+ ret = sysctl_follow_link(&h, &p, current->nsproxy);
+ err = ERR_PTR(ret);
+ if (ret)
+ goto out;
+ }
+
err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (h)
- sysctl_head_finish(h);
-
if (!inode)
goto out;
err = NULL;
- dentry->d_op = &proc_sys_dentry_operations;
+ d_set_d_op(dentry, &proc_sys_dentry_operations);
d_add(dentry, inode);
out:
+ if (h)
+ sysctl_head_finish(h);
sysctl_head_finish(head);
return err;
}
@@ -129,7 +479,7 @@ out:
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
size_t count, loff_t *ppos, int write)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
ssize_t error;
@@ -143,7 +493,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
* and won't be until we finish.
*/
error = -EPERM;
- if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+ if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
goto out;
/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -174,13 +524,61 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
}
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ if (table->poll)
+ filp->private_data = proc_sys_poll_event(table->poll);
+
+ sysctl_head_finish(head);
+
+ return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+ struct inode *inode = file_inode(filp);
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ unsigned int ret = DEFAULT_POLLMASK;
+ unsigned long event;
-static int proc_sys_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir,
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return POLLERR | POLLHUP;
+
+ if (!table->proc_handler)
+ goto out;
+
+ if (!table->poll)
+ goto out;
+
+ event = (unsigned long)filp->private_data;
+ poll_wait(filp, &table->poll->wait, wait);
+
+ if (event != atomic_read(&table->poll->event)) {
+ filp->private_data = proc_sys_poll_event(table->poll);
+ ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+out:
+ sysctl_head_finish(head);
+
+ return ret;
+}
+
+static bool proc_sys_fill_cache(struct file *file,
+ struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
- struct dentry *child, *dir = filp->f_path.dentry;
+ struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
@@ -197,99 +595,89 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
dput(child);
- return -ENOMEM;
+ return false;
} else {
- child->d_op = &proc_sys_dentry_operations;
+ d_set_d_op(child, &proc_sys_dentry_operations);
d_add(child, inode);
}
} else {
- return -ENOMEM;
+ return false;
}
}
inode = child->d_inode;
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
- return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+ return dir_emit(ctx, qname.name, qname.len, ino, type);
+}
+
+static bool proc_sys_link_fill_cache(struct file *file,
+ struct dir_context *ctx,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ bool ret = true;
+ head = sysctl_head_grab(head);
+
+ if (S_ISLNK(table->mode)) {
+ /* It is not an error if we can not follow the link ignore it */
+ int err = sysctl_follow_link(&head, &table, current->nsproxy);
+ if (err)
+ goto out;
+ }
+
+ ret = proc_sys_fill_cache(file, ctx, head, table);
+out:
+ sysctl_head_finish(head);
+ return ret;
}
static int scan(struct ctl_table_header *head, ctl_table *table,
unsigned long *pos, struct file *file,
- void *dirent, filldir_t filldir)
+ struct dir_context *ctx)
{
+ bool res;
- for (; table->procname; table++, (*pos)++) {
- int res;
-
- /* Can't do anything without a proc name */
- if (!table->procname)
- continue;
+ if ((*pos)++ < ctx->pos)
+ return true;
- if (*pos < file->f_pos)
- continue;
+ if (unlikely(S_ISLNK(table->mode)))
+ res = proc_sys_link_fill_cache(file, ctx, head, table);
+ else
+ res = proc_sys_fill_cache(file, ctx, head, table);
- res = proc_sys_fill_cache(file, dirent, filldir, head, table);
- if (res)
- return res;
+ if (res)
+ ctx->pos = *pos;
- file->f_pos = *pos + 1;
- }
- return 0;
+ return res;
}
-static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
+ struct ctl_table *entry;
+ struct ctl_dir *ctl_dir;
unsigned long pos;
- int ret = -EINVAL;
if (IS_ERR(head))
return PTR_ERR(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
+ ctl_dir = container_of(head, struct ctl_dir, header);
- table = table ? table->child : head->ctl_table;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- ret = 0;
- /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
- if (filp->f_pos == 0) {
- if (filldir(dirent, ".", 1, filp->f_pos,
- inode->i_ino, DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
- if (filp->f_pos == 1) {
- if (filldir(dirent, "..", 2, filp->f_pos,
- parent_ino(dentry), DT_DIR) < 0)
- goto out;
- filp->f_pos++;
- }
pos = 2;
- ret = scan(head, table, &pos, filp, dirent, filldir);
- if (ret)
- goto out;
-
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
- if (ret) {
+ for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
+ if (!scan(h, entry, &pos, file, ctx)) {
sysctl_head_finish(h);
break;
}
}
- ret = 1;
-out:
sysctl_head_finish(head);
- return ret;
+ return 0;
}
static int proc_sys_permission(struct inode *inode, int mask)
@@ -314,7 +702,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
if (!table) /* global root - r-xr-xr-x */
error = mask & MAY_WRITE ? -EACCES : 0;
else /* Use the permissions on the sysctl table entry */
- error = sysctl_perm(head->root, table, mask);
+ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
sysctl_head_finish(head);
return error;
@@ -329,10 +717,12 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
return -EPERM;
error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
+ if (error)
+ return error;
- return error;
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
}
static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -353,12 +743,16 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
}
static const struct file_operations proc_sys_file_operations = {
+ .open = proc_sys_open,
+ .poll = proc_sys_poll,
.read = proc_sys_read,
.write = proc_sys_write,
+ .llseek = default_llseek,
};
static const struct file_operations proc_sys_dir_file_operations = {
- .readdir = proc_sys_readdir,
+ .read = generic_read_dir,
+ .iterate = proc_sys_readdir,
.llseek = generic_file_llseek,
};
@@ -375,25 +769,51 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
{
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
return !PROC_I(dentry->d_inode)->sysctl->unregistering;
}
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
{
return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
}
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
- struct qstr *name)
+static int sysctl_is_seen(struct ctl_table_header *p)
{
- struct dentry *dentry = container_of(qstr, struct dentry, d_name);
- if (qstr->len != name->len)
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
+static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
+ unsigned int len, const char *str, const struct qstr *name)
+{
+ struct ctl_table_header *head;
+ struct inode *inode;
+
+ /* Although proc doesn't have negative dentries, rcu-walk means
+ * that inode here can be NULL */
+ /* AV: can it, indeed? */
+ inode = ACCESS_ONCE(dentry->d_inode);
+ if (!inode)
+ return 1;
+ if (name->len != len)
return 1;
- if (memcmp(qstr->name, name->name, name->len))
+ if (memcmp(name->name, str, len))
return 1;
- return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+ head = rcu_dereference(PROC_I(inode)->sysctl);
+ return !head || !sysctl_is_seen(head);
}
static const struct dentry_operations proc_sys_dentry_operations = {
@@ -402,6 +822,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {
.d_compare = proc_sys_compare,
};
+static struct ctl_dir *find_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ entry = find_entry(&head, dir, name, namelen);
+ if (!entry)
+ return ERR_PTR(-ENOENT);
+ if (!S_ISDIR(entry->mode))
+ return ERR_PTR(-ENOTDIR);
+ return container_of(head, struct ctl_dir, header);
+}
+
+static struct ctl_dir *new_dir(struct ctl_table_set *set,
+ const char *name, int namelen)
+{
+ struct ctl_table *table;
+ struct ctl_dir *new;
+ struct ctl_node *node;
+ char *new_name;
+
+ new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
+ sizeof(struct ctl_table)*2 + namelen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ node = (struct ctl_node *)(new + 1);
+ table = (struct ctl_table *)(node + 1);
+ new_name = (char *)(table + 2);
+ memcpy(new_name, name, namelen);
+ new_name[namelen] = '\0';
+ table[0].procname = new_name;
+ table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
+ init_header(&new->header, set->dir.header.root, set, node, table);
+
+ return new;
+}
+
+/**
+ * get_subdir - find or create a subdir with the specified name.
+ * @dir: Directory to create the subdirectory in
+ * @name: The name of the subdirectory to find or create
+ * @namelen: The length of name
+ *
+ * Takes a directory with an elevated reference count so we know that
+ * if we drop the lock the directory will not go away. Upon success
+ * the reference is moved from @dir to the returned subdirectory.
+ * Upon error an error code is returned and the reference on @dir is
+ * simply dropped.
+ */
+static struct ctl_dir *get_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_set *set = dir->header.set;
+ struct ctl_dir *subdir, *new = NULL;
+ int err;
+
+ spin_lock(&sysctl_lock);
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ spin_unlock(&sysctl_lock);
+ new = new_dir(set, name, namelen);
+ spin_lock(&sysctl_lock);
+ subdir = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto failed;
+
+ /* Was the subdir added while we dropped the lock? */
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ /* Nope. Use the our freshly made directory entry. */
+ err = insert_header(dir, &new->header);
+ subdir = ERR_PTR(err);
+ if (err)
+ goto failed;
+ subdir = new;
+found:
+ subdir->header.nreg++;
+failed:
+ if (unlikely(IS_ERR(subdir))) {
+ pr_err("sysctl could not get directory: ");
+ sysctl_print_dir(dir);
+ pr_cont("/%*.*s %ld\n",
+ namelen, namelen, name, PTR_ERR(subdir));
+ }
+ drop_sysctl_table(&dir->header);
+ if (new)
+ drop_sysctl_table(&new->header);
+ spin_unlock(&sysctl_lock);
+ return subdir;
+}
+
+static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
+{
+ struct ctl_dir *parent;
+ const char *procname;
+ if (!dir->header.parent)
+ return &set->dir;
+ parent = xlate_dir(set, dir->header.parent);
+ if (IS_ERR(parent))
+ return parent;
+ procname = dir->header.ctl_table[0].procname;
+ return find_subdir(parent, procname, strlen(procname));
+}
+
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces)
+{
+ struct ctl_table_header *head;
+ struct ctl_table_root *root;
+ struct ctl_table_set *set;
+ struct ctl_table *entry;
+ struct ctl_dir *dir;
+ int ret;
+
+ ret = 0;
+ spin_lock(&sysctl_lock);
+ root = (*pentry)->data;
+ set = lookup_header_set(root, namespaces);
+ dir = xlate_dir(set, (*phead)->parent);
+ if (IS_ERR(dir))
+ ret = PTR_ERR(dir);
+ else {
+ const char *procname = (*pentry)->procname;
+ head = NULL;
+ entry = find_entry(&head, dir, procname, strlen(procname));
+ ret = -ENOENT;
+ if (entry && use_table(head)) {
+ unuse_table(*phead);
+ *phead = head;
+ *pentry = entry;
+ ret = 0;
+ }
+ }
+
+ spin_unlock(&sysctl_lock);
+ return ret;
+}
+
+static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("sysctl table check failed: %s/%s %pV\n",
+ path, table->procname, &vaf);
+
+ va_end(args);
+ return -EINVAL;
+}
+
+static int sysctl_check_table(const char *path, struct ctl_table *table)
+{
+ int err = 0;
+ for (; table->procname; table++) {
+ if (table->child)
+ err = sysctl_err(path, table, "Not a file");
+
+ if ((table->proc_handler == proc_dostring) ||
+ (table->proc_handler == proc_dointvec) ||
+ (table->proc_handler == proc_dointvec_minmax) ||
+ (table->proc_handler == proc_dointvec_jiffies) ||
+ (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (table->proc_handler == proc_dointvec_ms_jiffies) ||
+ (table->proc_handler == proc_doulongvec_minmax) ||
+ (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!table->data)
+ err = sysctl_err(path, table, "No data");
+ if (!table->maxlen)
+ err = sysctl_err(path, table, "No maxlen");
+ }
+ if (!table->proc_handler)
+ err = sysctl_err(path, table, "No proc_handler");
+
+ if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+ err = sysctl_err(path, table, "bogus .mode 0%o",
+ table->mode);
+ }
+ return err;
+}
+
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
+ struct ctl_table_root *link_root)
+{
+ struct ctl_table *link_table, *entry, *link;
+ struct ctl_table_header *links;
+ struct ctl_node *node;
+ char *link_name;
+ int nr_entries, name_bytes;
+
+ name_bytes = 0;
+ nr_entries = 0;
+ for (entry = table; entry->procname; entry++) {
+ nr_entries++;
+ name_bytes += strlen(entry->procname) + 1;
+ }
+
+ links = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries +
+ sizeof(struct ctl_table)*(nr_entries + 1) +
+ name_bytes,
+ GFP_KERNEL);
+
+ if (!links)
+ return NULL;
+
+ node = (struct ctl_node *)(links + 1);
+ link_table = (struct ctl_table *)(node + nr_entries);
+ link_name = (char *)&link_table[nr_entries + 1];
+
+ for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ int len = strlen(entry->procname) + 1;
+ memcpy(link_name, entry->procname, len);
+ link->procname = link_name;
+ link->mode = S_IFLNK|S_IRWXUGO;
+ link->data = link_root;
+ link_name += len;
+ }
+ init_header(links, dir->header.root, dir->header.set, node, link_table);
+ links->nreg = nr_entries;
+
+ return links;
+}
+
+static bool get_links(struct ctl_dir *dir,
+ struct ctl_table *table, struct ctl_table_root *link_root)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry, *link;
+
+ /* Are there links available for every entry in table? */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ if (!link)
+ return false;
+ if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
+ continue;
+ if (S_ISLNK(link->mode) && (link->data == link_root))
+ continue;
+ return false;
+ }
+
+ /* The checks passed. Increase the registration count on the links */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ head->nreg++;
+ }
+ return true;
+}
+
+static int insert_links(struct ctl_table_header *head)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_dir *core_parent = NULL;
+ struct ctl_table_header *links;
+ int err;
+
+ if (head->set == root_set)
+ return 0;
+
+ core_parent = xlate_dir(root_set, head->parent);
+ if (IS_ERR(core_parent))
+ return 0;
+
+ if (get_links(core_parent, head->ctl_table, head->root))
+ return 0;
+
+ core_parent->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ links = new_links(core_parent, head->ctl_table, head->root);
+
+ spin_lock(&sysctl_lock);
+ err = -ENOMEM;
+ if (!links)
+ goto out;
+
+ err = 0;
+ if (get_links(core_parent, head->ctl_table, head->root)) {
+ kfree(links);
+ goto out;
+ }
+
+ err = insert_header(core_parent, links);
+ if (err)
+ kfree(links);
+out:
+ drop_sysctl_table(&core_parent->header);
+ return err;
+}
+
+/**
+ * __register_sysctl_table - register a leaf sysctl table
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ * enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file
+ *
+ * child - must be %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * There must be a proc_handler routine for any terminal nodes.
+ * Several default handlers are available to cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_table(
+ struct ctl_table_set *set,
+ const char *path, struct ctl_table *table)
+{
+ struct ctl_table_root *root = set->dir.header.root;
+ struct ctl_table_header *header;
+ const char *name, *nextname;
+ struct ctl_dir *dir;
+ struct ctl_table *entry;
+ struct ctl_node *node;
+ int nr_entries = 0;
+
+ for (entry = table; entry->procname; entry++)
+ nr_entries++;
+
+ header = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ if (!header)
+ return NULL;
+
+ node = (struct ctl_node *)(header + 1);
+ init_header(header, root, set, node, table);
+ if (sysctl_check_table(path, table))
+ goto fail;
+
+ spin_lock(&sysctl_lock);
+ dir = &set->dir;
+ /* Reference moved down the diretory tree get_subdir */
+ dir->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ /* Find the directory for the ctl_table */
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ goto fail;
+ }
+
+ spin_lock(&sysctl_lock);
+ if (insert_header(dir, header))
+ goto fail_put_dir_locked;
+
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+
+ return header;
+
+fail_put_dir_locked:
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+fail:
+ kfree(header);
+ dump_stack();
+ return NULL;
+}
+
+/**
+ * register_sysctl - register a sysctl table
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the table structure
+ *
+ * Register a sysctl table. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+{
+ return __register_sysctl_table(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl);
+
+static char *append_path(const char *path, char *pos, const char *name)
+{
+ int namelen;
+ namelen = strlen(name);
+ if (((pos - path) + namelen + 2) >= PATH_MAX)
+ return NULL;
+ memcpy(pos, name, namelen);
+ pos[namelen] = '/';
+ pos[namelen + 1] = '\0';
+ pos += namelen + 1;
+ return pos;
+}
+
+static int count_subheaders(struct ctl_table *table)
+{
+ int has_files = 0;
+ int nr_subheaders = 0;
+ struct ctl_table *entry;
+
+ /* special case: no directory and empty directory */
+ if (!table || !table->procname)
+ return 1;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_subheaders += count_subheaders(entry->child);
+ else
+ has_files = 1;
+ }
+ return nr_subheaders + has_files;
+}
+
+static int register_leaf_sysctl_tables(const char *path, char *pos,
+ struct ctl_table_header ***subheader, struct ctl_table_set *set,
+ struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = NULL;
+ struct ctl_table *entry, *files;
+ int nr_files = 0;
+ int nr_dirs = 0;
+ int err = -ENOMEM;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_dirs++;
+ else
+ nr_files++;
+ }
+
+ files = table;
+ /* If there are mixed files and directories we need a new table */
+ if (nr_dirs && nr_files) {
+ struct ctl_table *new;
+ files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
+ GFP_KERNEL);
+ if (!files)
+ goto out;
+
+ ctl_table_arg = files;
+ for (new = files, entry = table; entry->procname; entry++) {
+ if (entry->child)
+ continue;
+ *new = *entry;
+ new++;
+ }
+ }
+
+ /* Register everything except a directory full of subdirectories */
+ if (nr_files || !nr_dirs) {
+ struct ctl_table_header *header;
+ header = __register_sysctl_table(set, path, files);
+ if (!header) {
+ kfree(ctl_table_arg);
+ goto out;
+ }
+
+ /* Remember if we need to free the file table */
+ header->ctl_table_arg = ctl_table_arg;
+ **subheader = header;
+ (*subheader)++;
+ }
+
+ /* Recurse into the subdirectories. */
+ for (entry = table; entry->procname; entry++) {
+ char *child_pos;
+
+ if (!entry->child)
+ continue;
+
+ err = -ENAMETOOLONG;
+ child_pos = append_path(path, pos, entry->procname);
+ if (!child_pos)
+ goto out;
+
+ err = register_leaf_sysctl_tables(path, child_pos, subheader,
+ set, entry->child);
+ pos[0] = '\0';
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ /* On failure our caller will unregister all registered subheaders */
+ return err;
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl table hierarchy
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_set *set,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = table;
+ int nr_subheaders = count_subheaders(table);
+ struct ctl_table_header *header = NULL, **subheaders, **subheader;
+ const struct ctl_path *component;
+ char *new_path, *pos;
+
+ pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!new_path)
+ return NULL;
+
+ pos[0] = '\0';
+ for (component = path; component->procname; component++) {
+ pos = append_path(new_path, pos, component->procname);
+ if (!pos)
+ goto out;
+ }
+ while (table->procname && table->child && !table[1].procname) {
+ pos = append_path(new_path, pos, table->procname);
+ if (!pos)
+ goto out;
+ table = table->child;
+ }
+ if (nr_subheaders == 1) {
+ header = __register_sysctl_table(set, new_path, table);
+ if (header)
+ header->ctl_table_arg = ctl_table_arg;
+ } else {
+ header = kzalloc(sizeof(*header) +
+ sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
+ if (!header)
+ goto out;
+
+ subheaders = (struct ctl_table_header **) (header + 1);
+ subheader = subheaders;
+ header->ctl_table_arg = ctl_table_arg;
+
+ if (register_leaf_sysctl_tables(new_path, pos, &subheader,
+ set, table))
+ goto err_register_leaves;
+ }
+
+out:
+ kfree(new_path);
+ return header;
+
+err_register_leaves:
+ while (subheader > subheaders) {
+ struct ctl_table_header *subh = *(--subheader);
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ header = NULL;
+ goto out;
+}
+
+/**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return __register_sysctl_paths(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+static void put_links(struct ctl_table_header *header)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_table_root *root = header->root;
+ struct ctl_dir *parent = header->parent;
+ struct ctl_dir *core_parent;
+ struct ctl_table *entry;
+
+ if (header->set == root_set)
+ return;
+
+ core_parent = xlate_dir(root_set, parent);
+ if (IS_ERR(core_parent))
+ return;
+
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ struct ctl_table_header *link_head;
+ struct ctl_table *link;
+ const char *name = entry->procname;
+
+ link = find_entry(&link_head, core_parent, name, strlen(name));
+ if (link &&
+ ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
+ (S_ISLNK(link->mode) && (link->data == root)))) {
+ drop_sysctl_table(link_head);
+ }
+ else {
+ pr_err("sysctl link missing during unregister: ");
+ sysctl_print_dir(parent);
+ pr_cont("/%s\n", name);
+ }
+ }
+}
+
+static void drop_sysctl_table(struct ctl_table_header *header)
+{
+ struct ctl_dir *parent = header->parent;
+
+ if (--header->nreg)
+ return;
+
+ put_links(header);
+ start_unregistering(header);
+ if (!--header->count)
+ kfree_rcu(header, rcu);
+
+ if (parent)
+ drop_sysctl_table(&parent->header);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ int nr_subheaders;
+ might_sleep();
+
+ if (header == NULL)
+ return;
+
+ nr_subheaders = count_subheaders(header->ctl_table_arg);
+ if (unlikely(nr_subheaders > 1)) {
+ struct ctl_table_header **subheaders;
+ int i;
+
+ subheaders = (struct ctl_table_header **)(header + 1);
+ for (i = nr_subheaders -1; i >= 0; i--) {
+ struct ctl_table_header *subh = subheaders[i];
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ return;
+ }
+
+ spin_lock(&sysctl_lock);
+ drop_sysctl_table(header);
+ spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *set,
+ struct ctl_table_root *root,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ memset(set, 0, sizeof(*set));
+ set->is_seen = is_seen;
+ init_header(&set->dir.header, root, set, NULL, root_table);
+}
+
+void retire_sysctl_set(struct ctl_table_set *set)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
+}
+
int __init proc_sys_init(void)
{
struct proc_dir_entry *proc_sys_root;
@@ -410,5 +1577,6 @@ int __init proc_sys_init(void)
proc_sys_root->proc_iops = &proc_sys_dir_operations;
proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return 0;
+
+ return sysctl_init();
}
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc86943..cb761f01030 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
}
switch (p->type) {
case TTY_DRIVER_TYPE_SYSTEM:
- seq_printf(m, "system");
+ seq_puts(m, "system");
if (p->subtype == SYSTEM_TYPE_TTY)
- seq_printf(m, ":/dev/tty");
+ seq_puts(m, ":/dev/tty");
else if (p->subtype == SYSTEM_TYPE_SYSCONS)
- seq_printf(m, ":console");
+ seq_puts(m, ":console");
else if (p->subtype == SYSTEM_TYPE_CONSOLE)
- seq_printf(m, ":vtmaster");
+ seq_puts(m, ":vtmaster");
break;
case TTY_DRIVER_TYPE_CONSOLE:
- seq_printf(m, "console");
+ seq_puts(m, "console");
break;
case TTY_DRIVER_TYPE_SERIAL:
- seq_printf(m, "serial");
+ seq_puts(m, "serial");
break;
case TTY_DRIVER_TYPE_PTY:
if (p->subtype == PTY_TYPE_MASTER)
- seq_printf(m, "pty:master");
+ seq_puts(m, "pty:master");
else if (p->subtype == PTY_TYPE_SLAVE)
- seq_printf(m, "pty:slave");
+ seq_puts(m, "pty:slave");
else
- seq_printf(m, "pty");
+ seq_puts(m, "pty");
break;
default:
seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
/* pseudo-drivers first */
seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
- seq_printf(m, "system:/dev/tty\n");
+ seq_puts(m, "system:/dev/tty\n");
seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
- seq_printf(m, "system:console\n");
+ seq_puts(m, "system:console\n");
#ifdef CONFIG_UNIX98_PTYS
seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
- seq_printf(m, "system\n");
+ seq_puts(m, "system\n");
#endif
#ifdef CONFIG_VT
seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
- seq_printf(m, "system:vtmaster\n");
+ seq_puts(m, "system:vtmaster\n");
#endif
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e..5dbadecb234 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,8 +16,10 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/bitops.h>
+#include <linux/user_namespace.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -28,62 +30,116 @@ static int proc_test_super(struct super_block *sb, void *data)
static int proc_set_super(struct super_block *sb, void *data)
{
- struct pid_namespace *ns;
+ int err = set_anon_super(sb, NULL);
+ if (!err) {
+ struct pid_namespace *ns = (struct pid_namespace *)data;
+ sb->s_fs_info = get_pid_ns(ns);
+ }
+ return err;
+}
+
+enum {
+ Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_hidepid, "hidepid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ pid->pid_gid = make_kgid(current_user_ns(), option);
+ break;
+ case Opt_hidepid:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > 2) {
+ pr_err("proc: hidepid value must be between 0 and 2.\n");
+ return 0;
+ }
+ pid->hide_pid = option;
+ break;
+ default:
+ pr_err("proc: unrecognized mount option \"%s\" "
+ "or missing value\n", p);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct pid_namespace *pid = sb->s_fs_info;
- ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
- return set_anon_super(sb, NULL);
+ sync_filesystem(sb);
+ return !proc_parse_options(data, pid);
}
-static int proc_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *proc_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
int err;
struct super_block *sb;
struct pid_namespace *ns;
- struct proc_inode *ei;
-
- if (proc_mnt) {
- /* Seed the root directory with a pid so it doesn't need
- * to be special in base.c. I would do this earlier but
- * the only task alive when /proc is mounted the first time
- * is the init_task and it doesn't have any pids.
- */
- ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
- if (!ei->pid)
- ei->pid = find_get_pid(1);
- }
+ char *options;
- if (flags & MS_KERNMOUNT)
+ if (flags & MS_KERNMOUNT) {
ns = (struct pid_namespace *)data;
- else
- ns = current->nsproxy->pid_ns;
+ options = NULL;
+ } else {
+ ns = task_active_pid_ns(current);
+ options = data;
- sb = sget(fs_type, proc_test_super, proc_set_super, ns);
+ if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
+ return ERR_PTR(-EPERM);
+
+ /* Does the mounter have privilege over the pid namespace? */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ }
+
+ sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
if (IS_ERR(sb))
- return PTR_ERR(sb);
+ return ERR_CAST(sb);
+
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
if (!sb->s_root) {
- sb->s_flags = flags;
err = proc_fill_super(sb);
if (err) {
deactivate_locked_super(sb);
- return err;
- }
-
- ei = PROC_I(sb->s_root->d_inode);
- if (!ei->pid) {
- rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
- rcu_read_unlock();
+ return ERR_PTR(err);
}
sb->s_flags |= MS_ACTIVE;
- ns->proc_mnt = mnt;
}
- simple_set_mnt(mnt, sb);
- return 0;
+ return dget(sb->s_root);
}
static void proc_kill_sb(struct super_block *sb)
@@ -91,14 +147,17 @@ static void proc_kill_sb(struct super_block *sb)
struct pid_namespace *ns;
ns = (struct pid_namespace *)sb->s_fs_info;
+ if (ns->proc_self)
+ dput(ns->proc_self);
kill_anon_super(sb);
put_pid_ns(ns);
}
static struct file_system_type proc_fs_type = {
.name = "proc",
- .get_sb = proc_get_sb,
+ .mount = proc_mount,
.kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -109,13 +168,8 @@ void __init proc_root_init(void)
err = register_filesystem(&proc_fs_type);
if (err)
return;
- proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- err = PTR_ERR(proc_mnt);
- if (IS_ERR(proc_mnt)) {
- unregister_filesystem(&proc_fs_type);
- return;
- }
+ proc_self_init();
proc_symlink("mounts", NULL, "self/mounts");
proc_net_init();
@@ -131,9 +185,6 @@ void __init proc_root_init(void)
proc_mkdir("openprom", NULL);
#endif
proc_tty_init();
-#ifdef CONFIG_PROC_DEVICETREE
- proc_device_tree_init();
-#endif
proc_mkdir("bus", NULL);
proc_sys_init();
}
@@ -146,30 +197,24 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
return 0;
}
-static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_lookup(dir, dentry, nd)) {
+ if (!proc_lookup(dir, dentry, flags))
return NULL;
- }
- return proc_pid_lookup(dir, dentry, nd);
+ return proc_pid_lookup(dir, dentry, flags);
}
-static int proc_root_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
{
- unsigned int nr = filp->f_pos;
- int ret;
-
- if (nr < FIRST_PROCESS_ENTRY) {
- int error = proc_readdir(filp, dirent, filldir);
- if (error <= 0)
+ if (ctx->pos < FIRST_PROCESS_ENTRY) {
+ int error = proc_readdir(file, ctx);
+ if (unlikely(error <= 0))
return error;
- filp->f_pos = FIRST_PROCESS_ENTRY;
+ ctx->pos = FIRST_PROCESS_ENTRY;
}
- ret = proc_pid_readdir(filp, dirent, filldir);
- return ret;
+ return proc_pid_readdir(file, ctx);
}
/*
@@ -179,7 +224,8 @@ static int proc_root_readdir(struct file * filp,
*/
static const struct file_operations proc_root_operations = {
.read = generic_read_dir,
- .readdir = proc_root_readdir,
+ .iterate = proc_root_readdir,
+ .llseek = default_llseek,
};
/*
@@ -196,13 +242,13 @@ static const struct inode_operations proc_root_inode_operations = {
struct proc_dir_entry proc_root = {
.low_ino = PROC_ROOT_INO,
.namelen = 5,
- .name = "/proc",
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
.nlink = 2,
.count = ATOMIC_INIT(1),
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
+ .name = "/proc",
};
int pid_ns_prepare_proc(struct pid_namespace *ns)
@@ -213,16 +259,11 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
if (IS_ERR(mnt))
return PTR_ERR(mnt);
+ ns->proc_mnt = mnt;
return 0;
}
void pid_ns_release_proc(struct pid_namespace *ns)
{
- mntput(ns->proc_mnt);
+ kern_unmount(ns->proc_mnt);
}
-
-EXPORT_SYMBOL(proc_symlink);
-EXPORT_SYMBOL(proc_mkdir);
-EXPORT_SYMBOL(create_proc_entry);
-EXPORT_SYMBOL(proc_create_data);
-EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 00000000000..4348bb8907c
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,84 @@
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+ int buflen)
+{
+ struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ char tmp[PROC_NUMBUF];
+ if (!tgid)
+ return -ENOENT;
+ sprintf(tmp, "%d", tgid);
+ return readlink_copy(buffer, buflen, tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ char *name = ERR_PTR(-ENOENT);
+ if (tgid) {
+ /* 11 for max length of signed int in decimal + NULL term */
+ name = kmalloc(12, GFP_KERNEL);
+ if (!name)
+ name = ERR_PTR(-ENOMEM);
+ else
+ sprintf(name, "%d", tgid);
+ }
+ nd_set_link(nd, name);
+ return NULL;
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+ .readlink = proc_self_readlink,
+ .follow_link = proc_self_follow_link,
+ .put_link = kfree_put_link,
+};
+
+static unsigned self_inum;
+
+int proc_setup_self(struct super_block *s)
+{
+ struct inode *root_inode = s->s_root->d_inode;
+ struct pid_namespace *ns = s->s_fs_info;
+ struct dentry *self;
+
+ mutex_lock(&root_inode->i_mutex);
+ self = d_alloc_name(s->s_root, "self");
+ if (self) {
+ struct inode *inode = new_inode_pseudo(s);
+ if (inode) {
+ inode->i_ino = self_inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_op = &proc_self_inode_operations;
+ d_add(self, inode);
+ } else {
+ dput(self);
+ self = ERR_PTR(-ENOMEM);
+ }
+ } else {
+ self = ERR_PTR(-ENOMEM);
+ }
+ mutex_unlock(&root_inode->i_mutex);
+ if (IS_ERR(self)) {
+ pr_err("proc_fill_super: can't allocate /proc/self\n");
+ return PTR_ERR(self);
+ }
+ ns->proc_self = self;
+ return 0;
+}
+
+void __init proc_self_init(void)
+{
+ proc_alloc_inum(&self_inum);
+}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 1807c2419f1..ad8a77f94be 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
{
int i, j;
- seq_printf(p, " ");
+ seq_puts(p, " ");
for_each_possible_cpu(i)
seq_printf(p, "CPU%-8d", i);
- seq_printf(p, "\n");
+ seq_putc(p, '\n');
for (i = 0; i < NR_SOFTIRQS; i++) {
- seq_printf(p, "%8s:", softirq_to_name[i]);
+ seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
- seq_printf(p, "\n");
+ seq_putc(p, '\n');
}
return 0;
}
@@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void)
proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
return 0;
}
-module_init(proc_softirqs_init);
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003..bf2d03f8fd3 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
#include <linux/cpumask.h>
#include <linux/fs.h>
-#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
@@ -10,7 +9,8 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/irqnr.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
+#include <linux/tick.h>
#ifndef arch_irq_stat_cpu
#define arch_irq_stat_cpu(cpu) 0
@@ -18,44 +18,94 @@
#ifndef arch_irq_stat
#define arch_irq_stat() 0
#endif
-#ifndef arch_idle_time
-#define arch_idle_time(cpu) 0
+
+#ifdef arch_idle_time
+
+static cputime64_t get_idle_time(int cpu)
+{
+ cputime64_t idle;
+
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+ idle += arch_idle_time(cpu);
+ return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+ cputime64_t iowait;
+
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+ iowait += arch_idle_time(cpu);
+ return iowait;
+}
+
+#else
+
+static u64 get_idle_time(int cpu)
+{
+ u64 idle, idle_time = -1ULL;
+
+ if (cpu_online(cpu))
+ idle_time = get_cpu_idle_time_us(cpu, NULL);
+
+ if (idle_time == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ else
+ idle = usecs_to_cputime64(idle_time);
+
+ return idle;
+}
+
+static u64 get_iowait_time(int cpu)
+{
+ u64 iowait, iowait_time = -1ULL;
+
+ if (cpu_online(cpu))
+ iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+
+ if (iowait_time == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ else
+ iowait = usecs_to_cputime64(iowait_time);
+
+ return iowait;
+}
+
#endif
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
unsigned long jif;
- cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
- cputime64_t guest, guest_nice;
+ u64 user, nice, system, idle, iowait, irq, softirq, steal;
+ u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
- unsigned int per_irq_sum;
user = nice = system = idle = iowait =
- irq = softirq = steal = cputime64_zero;
- guest = guest_nice = cputime64_zero;
+ irq = softirq = steal = 0;
+ guest = guest_nice = 0;
getboottime(&boottime);
jif = boottime.tv_sec;
for_each_possible_cpu(i) {
- user = cputime64_add(user, kstat_cpu(i).cpustat.user);
- nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
- system = cputime64_add(system, kstat_cpu(i).cpustat.system);
- idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
- idle = cputime64_add(idle, arch_idle_time(i));
- iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
- irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
- softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
- steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
- guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
- guest_nice = cputime64_add(guest_nice,
- kstat_cpu(i).cpustat.guest_nice);
- for_each_irq_nr(j) {
- sum += kstat_irqs_cpu(j, i);
- }
+ user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(i);
+ iowait += get_iowait_time(i);
+ irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
@@ -67,57 +117,49 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
- for_each_online_cpu(i) {
+ seq_puts(p, "cpu ");
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
+ for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kstat_cpu(i).cpustat.user;
- nice = kstat_cpu(i).cpustat.nice;
- system = kstat_cpu(i).cpustat.system;
- idle = kstat_cpu(i).cpustat.idle;
- idle = cputime64_add(idle, arch_idle_time(i));
- iowait = kstat_cpu(i).cpustat.iowait;
- irq = kstat_cpu(i).cpustat.irq;
- softirq = kstat_cpu(i).cpustat.softirq;
- steal = kstat_cpu(i).cpustat.steal;
- guest = kstat_cpu(i).cpustat.guest;
- guest_nice = kstat_cpu(i).cpustat.guest_nice;
- seq_printf(p,
- "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- i,
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
+ user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(i);
+ iowait = get_iowait_time(i);
+ irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
/* sum again ? it could be updated? */
- for_each_irq_nr(j) {
- per_irq_sum = 0;
- for_each_possible_cpu(i)
- per_irq_sum += kstat_irqs_cpu(j, i);
-
- seq_printf(p, " %u", per_irq_sum);
- }
+ for_each_irq_nr(j)
+ seq_put_decimal_ull(p, ' ', kstat_irqs(j));
seq_printf(p,
"\nctxt %llu\n"
@@ -134,34 +176,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
for (i = 0; i < NR_SOFTIRQS; i++)
- seq_printf(p, " %u", per_softirq_sums[i]);
- seq_printf(p, "\n");
+ seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
+ seq_putc(p, '\n');
return 0;
}
static int stat_open(struct inode *inode, struct file *file)
{
- unsigned size = 4096 * (1 + num_possible_cpus() / 32);
- char *buf;
- struct seq_file *m;
- int res;
-
- /* don't ask for more than the kmalloc() max size, currently 128 KB */
- if (size > 128 * 1024)
- size = 128 * 1024;
- buf = kmalloc(size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- res = single_open(file, show_stat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
+ size_t size = 1024 + 128 * num_online_cpus();
+
+ /* minimum size to display an interrupt count : 2 bytes */
+ size += 2 * nr_irqs;
+ return single_open_size(file, show_stat, NULL, size);
}
static const struct file_operations proc_stat_operations = {
@@ -176,4 +203,4 @@ static int __init proc_stat_init(void)
proc_create("stat", 0, NULL, &proc_stat_operations);
return 0;
}
-module_init(proc_stat_init);
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f277c4a111c..cfa63ee92c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,13 +1,18 @@
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
+#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
+#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -16,7 +21,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib;
+ unsigned long data, text, lib, swap;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
@@ -36,25 +41,31 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+ swap = get_mm_counter(mm, MM_SWAPENTS);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n"
+ "VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
- "VmPTE:\t%8lu kB\n",
+ "VmPTE:\t%8lu kB\n"
+ "VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
- (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+ total_vm << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
+ mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+ (PTRS_PER_PTE * sizeof(pte_t) *
+ atomic_long_read(&mm->nr_ptes)) >> 10,
+ swap << (PAGE_SHIFT-10));
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -62,29 +73,67 @@ unsigned long task_vsize(struct mm_struct *mm)
return PAGE_SIZE * mm->total_vm;
}
-int task_statm(struct mm_struct *mm, int *shared, int *text,
- int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, file_rss);
+ *shared = get_mm_counter(mm, MM_FILEPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
- *resident = *shared + get_mm_counter(mm, anon_rss);
+ *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
-static void pad_len_spaces(struct seq_file *m, int len)
+#ifdef CONFIG_NUMA
+/*
+ * These functions are for numa_maps but called in generic **maps seq_file
+ * ->start(), ->stop() ops.
+ *
+ * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
+ * Each mempolicy object is controlled by reference counting. The problem here
+ * is how to avoid accessing dead mempolicy object.
+ *
+ * Because we're holding mmap_sem while reading seq_file, it's safe to access
+ * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
+ *
+ * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
+ * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
+ * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
+ * gurantee the task never exits under us. But taking task_lock() around
+ * get_vma_plicy() causes lock order problem.
+ *
+ * To access task->mempolicy without lock, we hold a reference count of an
+ * object pointed by task->mempolicy and remember it. This will guarantee
+ * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ */
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+ struct task_struct *task = priv->task;
+
+ task_lock(task);
+ priv->task_mempolicy = task->mempolicy;
+ mpol_get(priv->task_mempolicy);
+ task_unlock(task);
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
{
- len = 25 + sizeof(void*) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
+ mpol_put(priv->task_mempolicy);
}
+#else
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+#endif
static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
if (vma && vma != priv->tail_vma) {
struct mm_struct *mm = vma->vm_mm;
+ release_task_mempolicy(priv);
up_read(&mm->mmap_sem);
mmput(mm);
}
@@ -104,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/*
* We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
+ * vmacache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
@@ -114,16 +163,16 @@ static void *m_start(struct seq_file *m, loff_t *pos)
priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
if (!priv->task)
- return NULL;
+ return ERR_PTR(-ESRCH);
- mm = mm_for_maps(priv->task);
- if (!mm)
- return NULL;
+ mm = mm_access(priv->task, PTRACE_MODE_READ);
+ if (!mm || IS_ERR(mm))
+ return mm;
down_read(&mm->mmap_sem);
- tail_vma = get_gate_vma(priv->task);
+ tail_vma = get_gate_vma(priv->task->mm);
priv->tail_vma = tail_vma;
-
+ hold_task_mempolicy(priv);
/* Start with last addr hint */
vma = find_vma(mm, last_addr);
if (last_addr && vma) {
@@ -150,6 +199,7 @@ out:
if (vma)
return vma;
+ release_task_mempolicy(priv);
/* End of vmas has been reached */
m->version = (tail_vma != NULL)? 0: -1UL;
up_read(&mm->mmap_sem);
@@ -175,7 +225,8 @@ static void m_stop(struct seq_file *m, void *v)
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
- vma_stop(priv, vma);
+ if (!IS_ERR(vma))
+ vma_stop(priv, vma);
if (priv->task)
put_task_struct(priv->task);
}
@@ -199,109 +250,160 @@ static int do_maps_open(struct inode *inode, struct file *file,
return ret;
}
-static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
- int flags = vma->vm_flags;
+ struct proc_maps_private *priv = m->private;
+ struct task_struct *task = priv->task;
+ vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
+ unsigned long start, end;
dev_t dev = 0;
- int len;
+ const char *name = NULL;
if (file) {
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
}
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
- vma->vm_start,
- vma->vm_end,
+ /* We don't show the stack guard page in /proc/maps */
+ start = vma->vm_start;
+ if (stack_guard_page_start(vma, start))
+ start += PAGE_SIZE;
+ end = vma->vm_end;
+ if (stack_guard_page_end(vma, end))
+ end -= PAGE_SIZE;
+
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ start,
+ end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
pgoff,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
/*
* Print the dentry name for named mappings, and a
* special [heap] marker for the heap:
*/
if (file) {
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "\n");
- } else {
- const char *name = arch_vma_name(vma);
- if (!name) {
- if (mm) {
- if (vma->vm_start <= mm->start_brk &&
- vma->vm_end >= mm->brk) {
- name = "[heap]";
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- name = "[stack]";
- } else {
- unsigned long stack_start;
- struct proc_maps_private *pmp;
-
- pmp = m->private;
- stack_start = pmp->task->stack_start;
-
- if (vma->vm_start <= stack_start &&
- vma->vm_end >= stack_start) {
- pad_len_spaces(m, len);
- seq_printf(m,
- "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
- vma->vm_end - stack_start
-#else
- stack_start - vma->vm_start
-#endif
- );
- }
- }
+ goto done;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = vma->vm_ops->name(vma);
+ if (name)
+ goto done;
+ }
+
+ name = arch_vma_name(vma);
+ if (!name) {
+ pid_t tid;
+
+ if (!mm) {
+ name = "[vdso]";
+ goto done;
+ }
+
+ if (vma->vm_start <= mm->brk &&
+ vma->vm_end >= mm->start_brk) {
+ name = "[heap]";
+ goto done;
+ }
+
+ tid = vm_is_stack(task, vma, is_pid);
+
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack)) {
+ name = "[stack]";
} else {
- name = "[vdso]";
+ /* Thread stack in /proc/PID/maps */
+ seq_pad(m, ' ');
+ seq_printf(m, "[stack:%d]", tid);
}
}
- if (name) {
- pad_len_spaces(m, len);
- seq_puts(m, name);
- }
+ }
+
+done:
+ if (name) {
+ seq_pad(m, ' ');
+ seq_puts(m, name);
}
seq_putc(m, '\n');
}
-static int show_map(struct seq_file *m, void *v)
+static int show_map(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
+ m->version = (vma != get_gate_vma(task->mm))
+ ? vma->vm_start : 0;
return 0;
}
+static int show_pid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 0);
+}
+
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static const struct seq_operations proc_tid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_maps_op);
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -335,61 +437,151 @@ struct mem_size_stats {
unsigned long private_clean;
unsigned long private_dirty;
unsigned long referenced;
+ unsigned long anonymous;
+ unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long nonlinear;
u64 pss;
};
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+
+static void smaps_pte_entry(pte_t ptent, unsigned long addr,
+ unsigned long ptent_size, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
- pte_t *pte, ptent;
- spinlock_t *ptl;
- struct page *page;
+ pgoff_t pgoff = linear_page_index(vma, addr);
+ struct page *page = NULL;
int mapcount;
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
-
- if (is_swap_pte(ptent)) {
- mss->swap += PAGE_SIZE;
- continue;
- }
+ if (pte_present(ptent)) {
+ page = vm_normal_page(vma, addr, ptent);
+ } else if (is_swap_pte(ptent)) {
+ swp_entry_t swpent = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(swpent))
+ mss->swap += ptent_size;
+ else if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ } else if (pte_file(ptent)) {
+ if (pte_to_pgoff(ptent) != pgoff)
+ mss->nonlinear += ptent_size;
+ }
- if (!pte_present(ptent))
- continue;
+ if (!page)
+ return;
+
+ if (PageAnon(page))
+ mss->anonymous += ptent_size;
+
+ if (page->index != pgoff)
+ mss->nonlinear += ptent_size;
+
+ mss->resident += ptent_size;
+ /* Accumulate the size in pages that have been accessed. */
+ if (pte_young(ptent) || PageReferenced(page))
+ mss->referenced += ptent_size;
+ mapcount = page_mapcount(page);
+ if (mapcount >= 2) {
+ if (pte_dirty(ptent) || PageDirty(page))
+ mss->shared_dirty += ptent_size;
+ else
+ mss->shared_clean += ptent_size;
+ mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
+ } else {
+ if (pte_dirty(ptent) || PageDirty(page))
+ mss->private_dirty += ptent_size;
+ else
+ mss->private_clean += ptent_size;
+ mss->pss += (ptent_size << PSS_SHIFT);
+ }
+}
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
- continue;
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = mss->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
- mss->resident += PAGE_SIZE;
- /* Accumulate the size in pages that have been accessed. */
- if (pte_young(ptent) || PageReferenced(page))
- mss->referenced += PAGE_SIZE;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- if (pte_dirty(ptent))
- mss->shared_dirty += PAGE_SIZE;
- else
- mss->shared_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
- } else {
- if (pte_dirty(ptent))
- mss->private_dirty += PAGE_SIZE;
- else
- mss->private_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT);
- }
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
+ spin_unlock(ptl);
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ return 0;
}
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ /*
+ * The mmap_sem held all the way back in m_start() is what
+ * keeps khugepaged out of here and from collapsing things
+ * in here.
+ */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
}
-static int show_smap(struct seq_file *m, void *v)
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+ /*
+ * Don't forget to update Documentation/ on changes.
+ */
+ static const char mnemonics[BITS_PER_LONG][2] = {
+ /*
+ * In case if we meet a flag we don't know about.
+ */
+ [0 ... (BITS_PER_LONG-1)] = "??",
+
+ [ilog2(VM_READ)] = "rd",
+ [ilog2(VM_WRITE)] = "wr",
+ [ilog2(VM_EXEC)] = "ex",
+ [ilog2(VM_SHARED)] = "sh",
+ [ilog2(VM_MAYREAD)] = "mr",
+ [ilog2(VM_MAYWRITE)] = "mw",
+ [ilog2(VM_MAYEXEC)] = "me",
+ [ilog2(VM_MAYSHARE)] = "ms",
+ [ilog2(VM_GROWSDOWN)] = "gd",
+ [ilog2(VM_PFNMAP)] = "pf",
+ [ilog2(VM_DENYWRITE)] = "dw",
+ [ilog2(VM_LOCKED)] = "lo",
+ [ilog2(VM_IO)] = "io",
+ [ilog2(VM_SEQ_READ)] = "sr",
+ [ilog2(VM_RAND_READ)] = "rr",
+ [ilog2(VM_DONTCOPY)] = "dc",
+ [ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_ACCOUNT)] = "ac",
+ [ilog2(VM_NORESERVE)] = "nr",
+ [ilog2(VM_HUGETLB)] = "ht",
+ [ilog2(VM_NONLINEAR)] = "nl",
+ [ilog2(VM_ARCH_1)] = "ar",
+ [ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ [ilog2(VM_SOFTDIRTY)] = "sd",
+#endif
+ [ilog2(VM_MIXEDMAP)] = "mm",
+ [ilog2(VM_HUGEPAGE)] = "hg",
+ [ilog2(VM_NOHUGEPAGE)] = "nh",
+ [ilog2(VM_MERGEABLE)] = "mg",
+ };
+ size_t i;
+
+ seq_puts(m, "VmFlags: ");
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ if (vma->vm_flags & (1UL << i)) {
+ seq_printf(m, "%c%c ",
+ mnemonics[i][0], mnemonics[i][1]);
+ }
+ }
+ seq_putc(m, '\n');
+}
+
+static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
@@ -403,10 +595,11 @@ static int show_smap(struct seq_file *m, void *v)
memset(&mss, 0, sizeof mss);
mss.vma = vma;
+ /* mmap_sem is held in m_start */
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
seq_printf(m,
"Size: %8lu kB\n"
@@ -417,9 +610,12 @@ static int show_smap(struct seq_file *m, void *v)
"Private_Clean: %8lu kB\n"
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n",
+ "MMUPageSize: %8lu kB\n"
+ "Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -428,45 +624,151 @@ static int show_smap(struct seq_file *m, void *v)
mss.private_clean >> 10,
mss.private_dirty >> 10,
mss.referenced >> 10,
+ mss.anonymous >> 10,
+ mss.anonymous_thp >> 10,
mss.swap >> 10,
vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10);
+ vma_mmu_pagesize(vma) >> 10,
+ (vma->vm_flags & VM_LOCKED) ?
+ (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+
+ if (vma->vm_flags & VM_NONLINEAR)
+ seq_printf(m, "Nonlinear: %8lu kB\n",
+ mss.nonlinear >> 10);
+
+ show_smap_vma_flags(m, vma);
if (m->count < m->size) /* vma is copied successfully */
- m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+ m->version = (vma != get_gate_vma(task->mm))
+ ? vma->vm_start : 0;
return 0;
}
+static int show_pid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 1);
+}
+
+static int show_tid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 0);
+}
+
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_smap
+ .show = show_pid_smap
+};
+
+static const struct seq_operations proc_tid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_smap
};
-static int smaps_open(struct inode *inode, struct file *file)
+static int pid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
-const struct file_operations proc_smaps_operations = {
- .open = smaps_open,
+static int tid_smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_smaps_op);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+ .open = pid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
+const struct file_operations proc_tid_smaps_operations = {
+ .open = tid_smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ * but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ * these flag is set to denote, that user is aware of the
+ * new API and those page-shift bits change their meaning.
+ * The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ * of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See the
+ * Documentation/vm/soft-dirty.txt for full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+
+ if (pte_present(ptent)) {
+ ptent = pte_wrprotect(ptent);
+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_clear_soft_dirty(ptent);
+ } else if (pte_file(ptent)) {
+ ptent = pte_file_clear_soft_dirty(ptent);
+ }
+
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
+
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
if (!pte_present(ptent))
continue;
@@ -483,10 +785,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -494,29 +792,47 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF];
struct mm_struct *mm;
struct vm_area_struct *vma;
- long type;
+ enum clear_refs_types type;
+ int itype;
+ int rv;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- if (strict_strtol(strstrip(buffer), 10, &type))
- return -EINVAL;
- if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
+ if (rv < 0)
+ return rv;
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- task = get_proc_task(file->f_path.dentry->d_inode);
+
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ soft_dirty_cleared = true;
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
+ " See the linux/Documentation/vm/pagemap.txt for "
+ "details.\n");
+ }
+
+ task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct clear_refs_private cp = {
+ .type = type,
+ };
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
+ .private = &cp,
};
down_read(&mm->mmap_sem);
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_start(mm, 0, -1);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- clear_refs_walk.private = vma;
+ cp.vma = vma;
if (is_vm_hugetlb_page(vma))
continue;
/*
@@ -527,14 +843,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
*
* Writing 3 to /proc/pid/clear_refs only affects file
* mapped pages.
+ *
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
*/
if (type == CLEAR_REFS_ANON && vma->vm_file)
continue;
if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
continue;
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ }
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
+ if (type == CLEAR_REFS_SOFT_DIRTY)
+ mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -546,13 +870,23 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
const struct file_operations proc_clear_refs_operations = {
.write = clear_refs_write,
+ .llseek = noop_llseek,
};
+typedef struct {
+ u64 pme;
+} pagemap_entry_t;
+
struct pagemapread {
- u64 __user *out, *end;
+ int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
+ pagemap_entry_t *buffer;
+ bool v2;
};
-#define PM_ENTRY_BYTES sizeof(u64)
+#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
+
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -560,22 +894,29 @@ struct pagemapread {
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
-#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
+#define PM_FILE PM_STATUS(1LL)
+#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
-static int add_to_pagemap(unsigned long addr, u64 pfn,
+static inline pagemap_entry_t make_pme(u64 val)
+{
+ return (pagemap_entry_t) { .pme = val };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
struct pagemapread *pm)
{
- if (put_user(pfn, pm->out))
- return -EFAULT;
- pm->out++;
- if (pm->out >= pm->end)
+ pm->buffer[pm->pos++] = *pme;
+ if (pm->pos >= pm->len)
return PM_END_OF_BUFFER;
return 0;
}
@@ -586,60 +927,136 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
return err;
}
-static u64 swap_pte_to_pagemap_entry(pte_t pte)
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- swp_entry_t e = pte_to_swp_entry(pte);
- return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+ u64 frame, flags;
+ struct page *page = NULL;
+ int flags2 = 0;
+
+ if (pte_present(pte)) {
+ frame = pte_pfn(pte);
+ flags = PM_PRESENT;
+ page = vm_normal_page(vma, addr, pte);
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t entry;
+ if (pte_swp_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
+ entry = pte_to_swp_entry(pte);
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ flags = PM_SWAP;
+ if (is_migration_entry(entry))
+ page = migration_entry_to_page(entry);
+ } else {
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags2 |= __PM_SOFT_DIRTY;
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
+ return;
+ }
+
+ if (page && !PageAnon(page))
+ flags |= PM_FILE;
+ if ((vma->vm_flags & VM_SOFTDIRTY))
+ flags2 |= __PM_SOFT_DIRTY;
+
+ *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}
-static u64 pte_to_pagemap_entry(pte_t pte)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
{
- u64 pme = 0;
- if (is_swap_pte(pte))
- pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
- else if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
+ /*
+ * Currently pmd for thp is always present because thp can not be
+ * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd))
+ *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
+ | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
+ else
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
}
+#else
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pmd_t pmd, int offset, int pmd_flags2)
+{
+}
+#endif
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma;
struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
pte_t *pte;
int err = 0;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ int pmd_flags2;
+
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+ pmd_flags2 = __PM_SOFT_DIRTY;
+ else
+ pmd_flags2 = 0;
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ unsigned long offset;
+
+ offset = (addr & ~PAGEMAP_WALK_MASK) >>
+ PAGE_SHIFT;
+ thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ }
+ spin_unlock(ptl);
+ return err;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- u64 pfn = PM_NOT_PRESENT;
+ int flags2;
/* check to see if we've left 'vma' behind
* and need a new, higher one */
- if (vma && (addr >= vma->vm_end))
+ if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ flags2 = __PM_SOFT_DIRTY;
+ else
+ flags2 = 0;
+ pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
+ }
/* check that 'vma' actually covers this address,
* and that it isn't a huge page vma */
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pfn = pte_to_pagemap_entry(*pte);
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
- err = add_to_pagemap(addr, pfn, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -649,41 +1066,42 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
+#ifdef CONFIG_HUGETLB_PAGE
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+ pte_t pte, int offset, int flags2)
{
- u64 pme = 0;
if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
+ *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
+ PM_STATUS2(pm->v2, flags2) |
+ PM_PRESENT);
+ else
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
+ PM_STATUS2(pm->v2, flags2));
}
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- struct vm_area_struct *vma;
struct pagemapread *pm = walk->private;
- struct hstate *hs = NULL;
+ struct vm_area_struct *vma;
int err = 0;
+ int flags2;
+ pagemap_entry_t pme;
vma = find_vma(walk->mm, addr);
- if (vma)
- hs = hstate_vma(vma);
- for (; addr != end; addr += PAGE_SIZE) {
- u64 pfn = PM_NOT_PRESENT;
+ WARN_ON_ONCE(!vma);
- if (vma && (addr >= vma->vm_end)) {
- vma = find_vma(walk->mm, addr);
- if (vma)
- hs = hstate_vma(vma);
- }
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ flags2 = __PM_SOFT_DIRTY;
+ else
+ flags2 = 0;
- if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
- /* calculate pfn of the "raw" page in the hugepage. */
- int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
- pfn = huge_pte_to_pagemap_entry(*pte, offset);
- }
- err = add_to_pagemap(addr, pfn, pm);
+ for (; addr != end; addr += PAGE_SIZE) {
+ int offset = (addr & ~hmask) >> PAGE_SHIFT;
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -692,6 +1110,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
return err;
}
+#endif /* HUGETLB_PAGE */
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -699,11 +1118,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
* For each page in the address space, this file contains one 64-bit entry
* consisting of the following:
*
- * Bits 0-55 page frame number (PFN) if present
+ * Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
- * Bits 5-55 swap offset if swapped
+ * Bits 5-54 swap offset if swapped
* Bits 55-60 page shift (page size = 1<<page shift)
- * Bit 61 reserved for future use
+ * Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
*
@@ -720,72 +1139,46 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- struct page **pages, *page;
- unsigned long uaddr, uend;
+ struct task_struct *task = get_proc_task(file_inode(file));
struct mm_struct *mm;
struct pagemapread pm;
- int pagecount;
int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
+ int copied = 0;
if (!task)
goto out;
- ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto out_task;
-
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
goto out_task;
ret = 0;
-
if (!count)
goto out_task;
- mm = get_task_mm(task);
- if (!mm)
- goto out_task;
-
-
- uaddr = (unsigned long)buf & PAGE_MASK;
- uend = (unsigned long)(buf + count);
- pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
- ret = 0;
- if (pagecount == 0)
- goto out_mm;
- pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+ pm.v2 = soft_dirty_cleared;
+ pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+ pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
- if (!pages)
- goto out_mm;
-
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, current->mm, uaddr, pagecount,
- 1, 0, pages, NULL);
- up_read(&current->mm->mmap_sem);
+ if (!pm.buffer)
+ goto out_task;
- if (ret < 0)
+ mm = mm_access(task, PTRACE_MODE_READ);
+ ret = PTR_ERR(mm);
+ if (!mm || IS_ERR(mm))
goto out_free;
- if (ret != pagecount) {
- pagecount = ret;
- ret = -EFAULT;
- goto out_pages;
- }
-
- pm.out = (u64 __user *)buf;
- pm.end = (u64 __user *)(buf + count);
-
pagemap_walk.pmd_entry = pagemap_pte_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
pagemap_walk.mm = mm;
pagemap_walk.private = &pm;
@@ -804,56 +1197,350 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
* user buffer is tracked in "pm", and the walk
* will stop when we hit the end of the buffer.
*/
- ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
- if (ret == PM_END_OF_BUFFER)
- ret = 0;
- /* don't need mmap_sem for these, but this looks cleaner */
- *ppos += (char __user *)pm.out - buf;
- if (!ret)
- ret = (char __user *)pm.out - buf;
-
-out_pages:
- for (; pagecount; pagecount--) {
- page = pages[pagecount-1];
- if (!PageReserved(page))
- SetPageDirty(page);
- page_cache_release(page);
+ ret = 0;
+ while (count && (start_vaddr < end_vaddr)) {
+ int len;
+ unsigned long end;
+
+ pm.pos = 0;
+ end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+ /* overflow ? */
+ if (end < start_vaddr || end > end_vaddr)
+ end = end_vaddr;
+ down_read(&mm->mmap_sem);
+ ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+ up_read(&mm->mmap_sem);
+ start_vaddr = end;
+
+ len = min(count, PM_ENTRY_BYTES * pm.pos);
+ if (copy_to_user(buf, pm.buffer, len)) {
+ ret = -EFAULT;
+ goto out_mm;
+ }
+ copied += len;
+ buf += len;
+ count -= len;
}
-out_free:
- kfree(pages);
+ *ppos += copied;
+ if (!ret || ret == PM_END_OF_BUFFER)
+ ret = copied;
+
out_mm:
mmput(mm);
+out_free:
+ kfree(pm.buffer);
out_task:
put_task_struct(task);
out:
return ret;
}
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+ pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+ "to stop being page-shift some time soon. See the "
+ "linux/Documentation/vm/pagemap.txt for details.\n");
+ return 0;
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
+ .open = pagemap_open,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
-extern int show_numa_map(struct seq_file *m, void *v);
+
+struct numa_maps {
+ struct vm_area_struct *vma;
+ unsigned long pages;
+ unsigned long anon;
+ unsigned long active;
+ unsigned long writeback;
+ unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
+ unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+ struct proc_maps_private proc_maps;
+ struct numa_maps md;
+};
+
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
+ unsigned long nr_pages)
+{
+ int count = page_mapcount(page);
+
+ md->pages += nr_pages;
+ if (pte_dirty || PageDirty(page))
+ md->dirty += nr_pages;
+
+ if (PageSwapCache(page))
+ md->swapcache += nr_pages;
+
+ if (PageActive(page) || PageUnevictable(page))
+ md->active += nr_pages;
+
+ if (PageWriteback(page))
+ md->writeback += nr_pages;
+
+ if (PageAnon(page))
+ md->anon += nr_pages;
+
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
+ md->node[page_to_nid(page)] += nr_pages;
+}
+
+static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pte_present(pte))
+ return NULL;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md;
+ spinlock_t *ptl;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+ md = walk->private;
+
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
+ pte_t huge_pte = *(pte_t *)pmd;
+ struct page *page;
+
+ page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ if (page)
+ gather_stats(page, md, pte_dirty(huge_pte),
+ HPAGE_PMD_SIZE/PAGE_SIZE);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ do {
+ struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+ if (!page)
+ continue;
+ gather_stats(page, md, pte_dirty(*pte), 1);
+
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap_unlock(orig_pte, ptl);
+ return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md;
+ struct page *page;
+
+ if (!pte_present(*pte))
+ return 0;
+
+ page = pte_page(*pte);
+ if (!page)
+ return 0;
+
+ md = walk->private;
+ gather_stats(page, md, pte_dirty(*pte), 1);
+ return 0;
+}
+
+#else
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+static int show_numa_map(struct seq_file *m, void *v, int is_pid)
+{
+ struct numa_maps_private *numa_priv = m->private;
+ struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+ struct vm_area_struct *vma = v;
+ struct numa_maps *md = &numa_priv->md;
+ struct file *file = vma->vm_file;
+ struct task_struct *task = proc_priv->task;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mm_walk walk = {};
+ struct mempolicy *pol;
+ char buffer[64];
+ int nid;
+
+ if (!mm)
+ return 0;
+
+ /* Ensure we start with an empty set of numa_maps statistics. */
+ memset(md, 0, sizeof(*md));
+
+ md->vma = vma;
+
+ walk.hugetlb_entry = gather_hugetbl_stats;
+ walk.pmd_entry = gather_pte_stats;
+ walk.private = md;
+ walk.mm = mm;
+
+ pol = get_vma_policy(task, vma, vma->vm_start);
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_cond_put(pol);
+
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+ if (file) {
+ seq_puts(m, " file=");
+ seq_path(m, &file->f_path, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_puts(m, " heap");
+ } else {
+ pid_t tid = vm_is_stack(task, vma, is_pid);
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_puts(m, " stack");
+ else
+ seq_printf(m, " stack:%d", tid);
+ }
+ }
+
+ if (is_vm_hugetlb_page(vma))
+ seq_puts(m, " huge");
+
+ walk_page_range(vma->vm_start, vma->vm_end, &walk);
+
+ if (!md->pages)
+ goto out;
+
+ if (md->anon)
+ seq_printf(m, " anon=%lu", md->anon);
+
+ if (md->dirty)
+ seq_printf(m, " dirty=%lu", md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m, " swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m, " active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m, " writeback=%lu", md->writeback);
+
+ for_each_node_state(nid, N_MEMORY)
+ if (md->node[nid])
+ seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+out:
+ seq_putc(m, '\n');
+
+ if (m->count < m->size)
+ m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+ return 0;
+}
+
+static int show_pid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 1);
+}
+
+static int show_tid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 0);
+}
static const struct seq_operations proc_pid_numa_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_numa_map,
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_pid_numa_map,
};
-static int numa_maps_open(struct inode *inode, struct file *file)
+static const struct seq_operations proc_tid_numa_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_numa_map,
+};
+
+static int numa_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
- return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+ struct numa_maps_private *priv;
+ int ret = -ENOMEM;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ priv->proc_maps.pid = proc_pid(inode);
+ ret = seq_open(file, ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = priv;
+ } else {
+ kfree(priv);
+ }
+ }
+ return ret;
}
-const struct file_operations proc_numa_maps_operations = {
- .open = numa_maps_open,
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+static int tid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+ .open = pid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif
+
+const struct file_operations proc_tid_numa_maps_operations = {
+ .open = tid_numa_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81..678455d2d68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
#include <linux/fs_struct.h>
#include <linux/mount.h>
#include <linux/ptrace.h>
+#include <linux/slab.h>
#include <linux/seq_file.h>
#include "internal.h"
@@ -91,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
return vsize;
}
-int task_statm(struct mm_struct *mm, int *shared, int *text,
- int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
{
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *p;
- int size = kobjsize(mm);
+ unsigned long size = kobjsize(mm);
down_read(&mm->mmap_sem);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
@@ -124,26 +126,30 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
/*
* display a single VMA to a sequenced file
*/
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
+ int is_pid)
{
+ struct mm_struct *mm = vma->vm_mm;
+ struct proc_maps_private *priv = m->private;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
- int flags, len;
+ int flags;
unsigned long long pgoff = 0;
flags = vma->vm_flags;
file = vma->vm_file;
if (file) {
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
}
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
vma->vm_start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
@@ -151,14 +157,26 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
pgoff,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
if (file) {
- len = 25 + sizeof(void *) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
+ } else if (mm) {
+ pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+
+ if (tid != 0) {
+ seq_pad(m, ' ');
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_printf(m, "[stack]");
+ else
+ seq_printf(m, "[stack:%d]", tid);
+ }
}
seq_putc(m, '\n');
@@ -168,11 +186,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_p)
+static int show_map(struct seq_file *m, void *_p, int is_pid)
{
struct rb_node *p = _p;
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
+ is_pid);
+}
+
+static int show_pid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 0);
}
static void *m_start(struct seq_file *m, loff_t *pos)
@@ -185,13 +214,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/* pin the task and mm whilst we play with them */
priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
if (!priv->task)
- return NULL;
+ return ERR_PTR(-ESRCH);
- mm = mm_for_maps(priv->task);
- if (!mm) {
+ mm = mm_access(priv->task, PTRACE_MODE_READ);
+ if (!mm || IS_ERR(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
- return NULL;
+ return mm;
}
down_read(&mm->mmap_sem);
@@ -226,10 +255,18 @@ static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_ops = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static int maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
struct proc_maps_private *priv;
int ret = -ENOMEM;
@@ -237,7 +274,7 @@ static int maps_open(struct inode *inode, struct file *file)
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->pid = proc_pid(inode);
- ret = seq_open(file, &proc_pid_maps_ops);
+ ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
@@ -248,8 +285,25 @@ static int maps_open(struct inode *inode, struct file *file)
return ret;
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_tid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d45605..33de567c25a 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,21 +5,25 @@
#include <linux/seq_file.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec uptime;
struct timespec idle;
+ u64 idletime;
+ u64 nsec;
+ u32 rem;
int i;
- cputime_t idletime = cputime_zero;
+ idletime = 0;
for_each_possible_cpu(i)
- idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
+ idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
- do_posix_clock_monotonic_gettime(&uptime);
- monotonic_to_bootbased(&uptime);
- cputime_to_timespec(idletime, &idle);
+ get_monotonic_boottime(&uptime);
+ nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+ idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
@@ -45,4 +49,4 @@ static int __init proc_uptime_init(void)
proc_create("uptime", 0, NULL, &uptime_proc_fops);
return 0;
}
-module_init(proc_uptime_init);
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a60678..d2154eb6d78 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -31,4 +31,4 @@ static int __init proc_version_init(void)
proc_create("version", 0, NULL, &version_proc_fops);
return 0;
}
-module_init(proc_version_init);
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d3..382aa890e22 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -8,17 +8,23 @@
*/
#include <linux/mm.h>
-#include <linux/proc_fs.h>
+#include <linux/kcore.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/printk.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <asm/io.h>
+#include "internal.h"
/* List representing chunks of contiguous memory areas and their offsets in
* vmcore file.
@@ -28,11 +34,55 @@ static LIST_HEAD(vmcore_list);
/* Stores the pointer to the buffer containing kernel elf core headers. */
static char *elfcorebuf;
static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
-static struct proc_dir_entry *proc_vmcore = NULL;
+static struct proc_dir_entry *proc_vmcore;
+
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * The called function has to take care of module refcounting.
+ */
+static int (*oldmem_pfn_is_ram)(unsigned long pfn);
+
+int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+ if (oldmem_pfn_is_ram)
+ return -EBUSY;
+ oldmem_pfn_is_ram = fn;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram);
+
+void unregister_oldmem_pfn_is_ram(void)
+{
+ oldmem_pfn_is_ram = NULL;
+ wmb();
+}
+EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram);
+
+static int pfn_is_ram(unsigned long pfn)
+{
+ int (*fn)(unsigned long pfn);
+ /* pfn is ram unless fn() checks pagetype */
+ int ret = 1;
+
+ /*
+ * Ask hypervisor if the pfn is really ram.
+ * A ballooned page contains no data and reading from such a page
+ * will cause high load in the hypervisor.
+ */
+ fn = oldmem_pfn_is_ram;
+ if (fn)
+ ret = fn(pfn);
+
+ return ret;
+}
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
@@ -54,9 +104,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
else
nr_bytes = count;
- tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);
- if (tmp < 0)
- return tmp;
+ /* If pfn is not ram, return zeros for sparse dump files */
+ if (pfn_is_ram(pfn) == 0)
+ memset(buf, 0, nr_bytes);
+ else {
+ tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+ offset, userbuf);
+ if (tmp < 0)
+ return tmp;
+ }
*ppos += nr_bytes;
count -= nr_bytes;
buf += nr_bytes;
@@ -68,37 +124,70 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
return read;
}
-/* Maps vmcore file offset to respective physical address in memroy. */
-static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
- struct vmcore **m_ptr)
+/*
+ * Architectures may override this function to allocate ELF header in 2nd kernel
+ */
+int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
- struct vmcore *m;
- u64 paddr;
+ return 0;
+}
- list_for_each_entry(m, vc_list, list) {
- u64 start, end;
- start = m->offset;
- end = m->offset + m->size - 1;
- if (offset >= start && offset <= end) {
- paddr = m->paddr + offset - start;
- *m_ptr = m;
- return paddr;
- }
+/*
+ * Architectures may override this function to free header
+ */
+void __weak elfcorehdr_free(unsigned long long addr)
+{}
+
+/*
+ * Architectures may override this function to read from ELF header
+ */
+ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+ return read_from_oldmem(buf, count, ppos, 0);
+}
+
+/*
+ * Architectures may override this function to read from notes sections
+ */
+ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
+{
+ return read_from_oldmem(buf, count, ppos, 0);
+}
+
+/*
+ * Architectures may override this function to map oldmem
+ */
+int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range(vma, from, pfn, size, prot);
+}
+
+/*
+ * Copy to either kernel or user space
+ */
+static int copy_to(void *target, void *src, size_t size, int userbuf)
+{
+ if (userbuf) {
+ if (copy_to_user((char __user *) target, src, size))
+ return -EFAULT;
+ } else {
+ memcpy(target, src, size);
}
- *m_ptr = NULL;
return 0;
}
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
+ int userbuf)
{
ssize_t acc = 0, tmp;
size_t tsz;
- u64 start, nr_bytes;
- struct vmcore *curr_m = NULL;
+ u64 start;
+ struct vmcore *m = NULL;
if (buflen == 0 || *fpos >= vmcore_size)
return 0;
@@ -109,10 +198,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = elfcorebuf_sz - *fpos;
- if (buflen < tsz)
- tsz = buflen;
- if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
+ if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
return -EFAULT;
buflen -= tsz;
*fpos += tsz;
@@ -124,146 +211,395 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
return acc;
}
- start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
- if (!curr_m)
- return -EINVAL;
- if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
- tsz = buflen;
-
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
-
- while (buflen) {
- tmp = read_from_oldmem(buffer, tsz, &start, 1);
- if (tmp < 0)
- return tmp;
+ /* Read Elf note segment */
+ if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+ if (copy_to(buffer, kaddr, tsz, userbuf))
+ return -EFAULT;
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
acc += tsz;
- if (start >= (curr_m->paddr + curr_m->size)) {
- if (curr_m->list.next == &vmcore_list)
- return acc; /*EOF*/
- curr_m = list_entry(curr_m->list.next,
- struct vmcore, list);
- start = curr_m->paddr;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (*fpos < m->offset + m->size) {
+ tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ start = m->paddr + *fpos - m->offset;
+ tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+ if (tmp < 0)
+ return tmp;
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (buflen == 0)
+ return acc;
}
- if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
- tsz = buflen;
- /* Calculate left bytes in current memory segment. */
- nr_bytes = (curr_m->size - (start - curr_m->paddr));
- if (tsz > nr_bytes)
- tsz = nr_bytes;
}
+
return acc;
}
-static const struct file_operations proc_vmcore_operations = {
- .read = read_vmcore,
+static ssize_t read_vmcore(struct file *file, char __user *buffer,
+ size_t buflen, loff_t *fpos)
+{
+ return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+}
+
+/*
+ * The vmcore fault handler uses the page cache and fills data using the
+ * standard __vmcore_read() function.
+ *
+ * On s390 the fault handler is used for memory regions that can't be mapped
+ * directly with remap_pfn_range().
+ */
+static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+#ifdef CONFIG_S390
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t index = vmf->pgoff;
+ struct page *page;
+ loff_t offset;
+ char *buf;
+ int rc;
+
+ page = find_or_create_page(mapping, index, GFP_KERNEL);
+ if (!page)
+ return VM_FAULT_OOM;
+ if (!PageUptodate(page)) {
+ offset = (loff_t) index << PAGE_CACHE_SHIFT;
+ buf = __va((page_to_pfn(page) << PAGE_SHIFT));
+ rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ if (rc < 0) {
+ unlock_page(page);
+ page_cache_release(page);
+ return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ }
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ vmf->page = page;
+ return 0;
+#else
+ return VM_FAULT_SIGBUS;
+#endif
+}
+
+static const struct vm_operations_struct vmcore_mmap_ops = {
+ .fault = mmap_vmcore_fault,
};
-static struct vmcore* __init get_new_element(void)
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ * vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
{
- return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
+#ifdef CONFIG_MMU
+ return vmalloc_user(notes_sz);
+#else
+ return vzalloc(notes_sz);
+#endif
}
-static u64 __init get_vmcore_size_elf64(char *elfptr)
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
- int i;
- u64 size;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr *phdr_ptr;
+ size_t size = vma->vm_end - vma->vm_start;
+ u64 start, end, len, tsz;
+ struct vmcore *m;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
- size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
+ start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ end = start + size;
+
+ if (size > vmcore_size || end > vmcore_size)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_ops = &vmcore_mmap_ops;
+
+ len = 0;
+
+ if (start < elfcorebuf_sz) {
+ u64 pfn;
+
+ tsz = min(elfcorebuf_sz - (size_t)start, size);
+ pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+ if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+ vma->vm_page_prot))
+ return -EAGAIN;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
}
- return size;
+
+ if (start < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+ kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+ kaddr, tsz))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (start < m->offset + m->size) {
+ u64 paddr = 0;
+
+ tsz = min_t(size_t, m->offset + m->size - start, size);
+ paddr = m->paddr + start - m->offset;
+ if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
+ paddr >> PAGE_SHIFT, tsz,
+ vma->vm_page_prot))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+ }
+
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, vma->vm_start, len);
+ return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ return -ENOSYS;
}
+#endif
+
+static const struct file_operations proc_vmcore_operations = {
+ .read = read_vmcore,
+ .llseek = default_llseek,
+ .mmap = mmap_vmcore,
+};
-static u64 __init get_vmcore_size_elf32(char *elfptr)
+static struct vmcore* __init get_new_element(void)
+{
+ return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
+}
+
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
- int i;
u64 size;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr *phdr_ptr;
+ struct vmcore *m;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
- size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
- for (i = 0; i < ehdr_ptr->e_phnum; i++) {
- size += phdr_ptr->p_memsz;
- phdr_ptr++;
+ size = elfsz + elfnotesegsz;
+ list_for_each_entry(m, vc_list, list) {
+ size += m->size;
}
return size;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf64_Ehdr *ehdr_ptr;
- Elf64_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
Elf64_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
- phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
if (!notes_section)
return -ENOMEM;
- rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
+ rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
if (rc < 0) {
kfree(notes_section);
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
- if (nhdr_ptr->n_namesz == 0)
- break;
+ while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf64_Nhdr) +
((nhdr_ptr->n_namesz + 3) & ~3) +
((nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
real_sz += sz;
nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf64_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+ &offset);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf64(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf64_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -277,6 +613,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -284,67 +622,170 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return 0;
}
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
- struct list_head *vc_list)
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
{
- int i, nr_ptnote=0, rc=0;
- char *tmp;
- Elf32_Ehdr *ehdr_ptr;
- Elf32_Phdr phdr, *phdr_ptr;
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
Elf32_Nhdr *nhdr_ptr;
- u64 phdr_sz = 0, note_off;
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
- phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
- int j;
void *notes_section;
- struct vmcore *new;
u64 offset, max_sz, sz, real_sz = 0;
if (phdr_ptr->p_type != PT_NOTE)
continue;
- nr_ptnote++;
max_sz = phdr_ptr->p_memsz;
offset = phdr_ptr->p_offset;
notes_section = kmalloc(max_sz, GFP_KERNEL);
if (!notes_section)
return -ENOMEM;
- rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
+ rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
if (rc < 0) {
kfree(notes_section);
return rc;
}
nhdr_ptr = notes_section;
- for (j = 0; j < max_sz; j += sz) {
- if (nhdr_ptr->n_namesz == 0)
- break;
+ while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf32_Nhdr) +
((nhdr_ptr->n_namesz + 3) & ~3) +
((nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
real_sz += sz;
nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
}
-
- /* Add this contiguous chunk of notes section to vmcore list.*/
- new = get_new_element();
- if (!new) {
- kfree(notes_section);
- return -ENOMEM;
- }
- new->paddr = phdr_ptr->p_offset;
- new->size = real_sz;
- list_add_tail(&new->list, vc_list);
- phdr_sz += real_sz;
kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf32_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+ &offset);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
}
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf32(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
/* Prepare merged PT_NOTE program header. */
phdr.p_type = PT_NOTE;
phdr.p_flags = 0;
note_off = sizeof(Elf32_Ehdr) +
(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
- phdr.p_offset = note_off;
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
phdr.p_vaddr = phdr.p_paddr = 0;
phdr.p_filesz = phdr.p_memsz = phdr_sz;
phdr.p_align = 0;
@@ -358,6 +799,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
*elfsz = *elfsz - i;
memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -369,6 +812,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
* the new offset fields of exported program headers. */
static int __init process_ptload_program_headers_elf64(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -380,32 +824,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset. */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
static int __init process_ptload_program_headers_elf32(char *elfptr,
size_t elfsz,
+ size_t elfnotes_sz,
struct list_head *vc_list)
{
int i;
@@ -417,43 +867,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* First program header is PT_NOTE header. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
- phdr_ptr->p_memsz; /* Note sections */
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
if (phdr_ptr->p_type != PT_LOAD)
continue;
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
/* Add this contiguous chunk of memory to vmcore list.*/
new = get_new_element();
if (!new)
return -ENOMEM;
- new->paddr = phdr_ptr->p_offset;
- new->size = phdr_ptr->p_memsz;
+ new->paddr = start;
+ new->size = size;
list_add_tail(&new->list, vc_list);
/* Update the program header offset */
- phdr_ptr->p_offset = vmcore_off;
- vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
}
return 0;
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf64(char *elfptr,
- struct list_head *vc_list)
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
loff_t vmcore_off;
- Elf64_Ehdr *ehdr_ptr;
struct vmcore *m;
- ehdr_ptr = (Elf64_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf64_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
m->offset = vmcore_off;
@@ -461,24 +912,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
}
}
-/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf32(char *elfptr,
- struct list_head *vc_list)
+static void free_elfcorebuf(void)
{
- loff_t vmcore_off;
- Elf32_Ehdr *ehdr_ptr;
- struct vmcore *m;
-
- ehdr_ptr = (Elf32_Ehdr *)elfptr;
-
- /* Skip Elf header and program headers. */
- vmcore_off = sizeof(Elf32_Ehdr) +
- (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
-
- list_for_each_entry(m, vc_list, list) {
- m->offset = vmcore_off;
- vmcore_off += m->size;
- }
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = NULL;
+ vfree(elfnotes_buf);
+ elfnotes_buf = NULL;
}
static int __init parse_crash_elf64_headers(void)
@@ -490,51 +929,51 @@ static int __init parse_crash_elf64_headers(void)
addr = elfcorehdr_addr;
/* Read Elf header */
- rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0);
+ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
if (rc < 0)
return rc;
/* Do some basic Verification. */
if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
(ehdr.e_type != ET_CORE) ||
- !vmcore_elf_check_arch(&ehdr) ||
+ !vmcore_elf64_check_arch(&ehdr) ||
ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
ehdr.e_version != EV_CURRENT ||
ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
ehdr.e_phnum == 0) {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- "sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+ ehdr.e_phnum * sizeof(Elf64_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf32_headers(void)
@@ -546,7 +985,7 @@ static int __init parse_crash_elf32_headers(void)
addr = elfcorehdr_addr;
/* Read Elf header */
- rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0);
+ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -560,37 +999,36 @@ static int __init parse_crash_elf32_headers(void)
ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
ehdr.e_phnum == 0) {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- "sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
/* Read in all elf headers. */
- elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
- elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+ elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
if (!elfcorebuf)
return -ENOMEM;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
- if (rc < 0) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+ if (rc < 0)
+ goto fail;
/* Merge all PT_NOTE headers into one. */
- rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
+ rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
- &vmcore_list);
- if (rc) {
- kfree(elfcorebuf);
- return rc;
- }
- set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
}
static int __init parse_crash_elf_headers(void)
@@ -600,12 +1038,11 @@ static int __init parse_crash_elf_headers(void)
int rc=0;
addr = elfcorehdr_addr;
- rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0);
+ rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
if (rc < 0)
return rc;
if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
- printk(KERN_WARNING "Warning: Core image elf header"
- " not found\n");
+ pr_warn("Warning: Core image elf header not found\n");
return -EINVAL;
}
@@ -613,21 +1050,19 @@ static int __init parse_crash_elf_headers(void)
rc = parse_crash_elf64_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf64(elfcorebuf);
} else if (e_ident[EI_CLASS] == ELFCLASS32) {
rc = parse_crash_elf32_headers();
if (rc)
return rc;
-
- /* Determine vmcore size. */
- vmcore_size = get_vmcore_size_elf32(elfcorebuf);
} else {
- printk(KERN_WARNING "Warning: Core image elf header is not"
- " sane\n");
+ pr_warn("Warning: Core image elf header is not sane\n");
return -EINVAL;
}
+
+ /* Determine vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+
return 0;
}
@@ -636,18 +1071,48 @@ static int __init vmcore_init(void)
{
int rc = 0;
- /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
+ /* Allow architectures to allocate ELF header in 2nd kernel */
+ rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
+ if (rc)
+ return rc;
+ /*
+ * If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
+ * then capture the dump.
+ */
if (!(is_vmcore_usable()))
return rc;
rc = parse_crash_elf_headers();
if (rc) {
- printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+ pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
+ elfcorehdr_free(elfcorehdr_addr);
+ elfcorehdr_addr = ELFCORE_ADDR_ERR;
proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
if (proc_vmcore)
proc_vmcore->size = vmcore_size;
return 0;
}
-module_init(vmcore_init)
+fs_initcall(vmcore_init);
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+ struct list_head *pos, *next;
+
+ if (proc_vmcore) {
+ proc_remove(proc_vmcore);
+ proc_vmcore = NULL;
+ }
+
+ /* clear the vmcore list. */
+ list_for_each_safe(pos, next, &vmcore_list) {
+ struct vmcore *m;
+
+ m = list_entry(pos, struct vmcore, list);
+ list_del(&m->list);
+ kfree(m);
+ }
+ free_elfcorebuf();
+}