diff options
Diffstat (limited to 'fs/proc')
| -rw-r--r-- | fs/proc/Makefile | 1 | ||||
| -rw-r--r-- | fs/proc/array.c | 4 | ||||
| -rw-r--r-- | fs/proc/base.c | 118 | ||||
| -rw-r--r-- | fs/proc/generic.c | 1 | ||||
| -rw-r--r-- | fs/proc/inode.c | 7 | ||||
| -rw-r--r-- | fs/proc/internal.h | 26 | ||||
| -rw-r--r-- | fs/proc/namespaces.c | 201 | ||||
| -rw-r--r-- | fs/proc/proc_sysctl.c | 3 | ||||
| -rw-r--r-- | fs/proc/root.c | 11 | ||||
| -rw-r--r-- | fs/proc/stat.c | 6 | ||||
| -rw-r--r-- | fs/proc/task_mmu.c | 245 | ||||
| -rw-r--r-- | fs/proc/vmcore.c | 52 | 
12 files changed, 555 insertions, 120 deletions
| diff --git a/fs/proc/Makefile b/fs/proc/Makefile index df434c5f28f..c1c72933592 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -20,6 +20,7 @@ proc-y	+= stat.o  proc-y	+= uptime.o  proc-y	+= version.o  proc-y	+= softirqs.o +proc-y	+= namespaces.o  proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o  proc-$(CONFIG_NET)		+= proc_net.o  proc-$(CONFIG_PROC_KCORE)	+= kcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 5e4f776b091..9b45ee84fbc 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -131,7 +131,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)   * you can test for combinations of others with   * simple bit tests.   */ -static const char *task_state_array[] = { +static const char * const task_state_array[] = {  	"R (running)",		/*   0 */  	"S (sleeping)",		/*   1 */  	"D (disk sleep)",	/*   2 */ @@ -147,7 +147,7 @@ static const char *task_state_array[] = {  static inline const char *get_task_state(struct task_struct *tsk)  {  	unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; -	const char **p = &task_state_array[0]; +	const char * const *p = &task_state_array[0];  	BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); diff --git a/fs/proc/base.c b/fs/proc/base.c index dfa532730e5..8a84210ca08 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,9 @@  #include <linux/pid_namespace.h>  #include <linux/fs_struct.h>  #include <linux/slab.h> +#ifdef CONFIG_HARDWALL +#include <asm/hardwall.h> +#endif  #include "internal.h"  /* NOTE: @@ -600,7 +603,7 @@ static int proc_fd_access_allowed(struct inode *inode)  	return allowed;  } -static int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct dentry *dentry, struct iattr *attr)  {  	int error;  	struct inode *inode = dentry->d_inode; @@ -894,20 +897,20 @@ static ssize_t mem_write(struct file * file, const char __user *buf,  	if (!task)  		goto out_no_task; +	copied = -ENOMEM; +	page = (char *)__get_free_page(GFP_TEMPORARY); +	if (!page) +		goto out_task; +  	mm = check_mem_permission(task);  	copied = PTR_ERR(mm);  	if (IS_ERR(mm)) -		goto out_task; +		goto out_free;  	copied = -EIO;  	if (file->private_data != (void *)((long)current->self_exec_id))  		goto out_mm; -	copied = -ENOMEM; -	page = (char *)__get_free_page(GFP_TEMPORARY); -	if (!page) -		goto out_mm; -  	copied = 0;  	while (count > 0) {  		int this_len, retval; @@ -929,9 +932,11 @@ static ssize_t mem_write(struct file * file, const char __user *buf,  		count -= retval;			  	}  	*ppos = dst; -	free_page((unsigned long) page); +  out_mm:  	mmput(mm); +out_free: +	free_page((unsigned long) page);  out_task:  	put_task_struct(task);  out_no_task: @@ -1059,7 +1064,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,  {  	struct task_struct *task;  	char buffer[PROC_NUMBUF]; -	long oom_adjust; +	int oom_adjust;  	unsigned long flags;  	int err; @@ -1071,7 +1076,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,  		goto out;  	} -	err = strict_strtol(strstrip(buffer), 0, &oom_adjust); +	err = kstrtoint(strstrip(buffer), 0, &oom_adjust);  	if (err)  		goto out;  	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && @@ -1168,7 +1173,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,  	struct task_struct *task;  	char buffer[PROC_NUMBUF];  	unsigned long flags; -	long oom_score_adj; +	int oom_score_adj;  	int err;  	memset(buffer, 0, sizeof(buffer)); @@ -1179,7 +1184,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,  		goto out;  	} -	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); +	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);  	if (err)  		goto out;  	if (oom_score_adj < OOM_SCORE_ADJ_MIN || @@ -1468,7 +1473,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,  	struct inode *inode = file->f_path.dentry->d_inode;  	struct task_struct *p;  	char buffer[PROC_NUMBUF]; -	long nice; +	int nice;  	int err;  	memset(buffer, 0, sizeof(buffer)); @@ -1477,9 +1482,9 @@ sched_autogroup_write(struct file *file, const char __user *buf,  	if (copy_from_user(buffer, buf, count))  		return -EFAULT; -	err = strict_strtol(strstrip(buffer), 0, &nice); -	if (err) -		return -EINVAL; +	err = kstrtoint(strstrip(buffer), 0, &nice); +	if (err < 0) +		return err;  	p = get_proc_task(inode);  	if (!p) @@ -1576,57 +1581,6 @@ static const struct file_operations proc_pid_set_comm_operations = {  	.release	= single_release,  }; -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ -	mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ -	mm->num_exe_file_vmas--; -	if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ -		fput(mm->exe_file); -		mm->exe_file = NULL; -	} - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ -	if (new_exe_file) -		get_file(new_exe_file); -	if (mm->exe_file) -		fput(mm->exe_file); -	mm->exe_file = new_exe_file; -	mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ -	struct file *exe_file; - -	/* We need mmap_sem to protect against races with removal of -	 * VM_EXECUTABLE vmas */ -	down_read(&mm->mmap_sem); -	exe_file = mm->exe_file; -	if (exe_file) -		get_file(exe_file); -	up_read(&mm->mmap_sem); -	return exe_file; -} - -void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ -	/* It's safe to write the exe_file pointer without exe_file_lock because -	 * this is called during fork when the task is not yet in /proc */ -	newmm->exe_file = get_mm_exe_file(oldmm); -} -  static int proc_exe_link(struct inode *inode, struct path *exe_path)  {  	struct task_struct *task; @@ -1736,8 +1690,7 @@ static int task_dumpable(struct task_struct *task)  	return 0;  } - -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)  {  	struct inode * inode;  	struct proc_inode *ei; @@ -1779,7 +1732,7 @@ out_unlock:  	return NULL;  } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)  {  	struct inode *inode = dentry->d_inode;  	struct task_struct *task; @@ -1820,7 +1773,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat   * made this apply to all per process world readable and executable   * directories.   */ -static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, struct nameidata *nd)  {  	struct inode *inode;  	struct task_struct *task; @@ -1862,7 +1815,7 @@ static int pid_delete_dentry(const struct dentry * dentry)  	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;  } -static const struct dentry_operations pid_dentry_operations = +const struct dentry_operations pid_dentry_operations =  {  	.d_revalidate	= pid_revalidate,  	.d_delete	= pid_delete_dentry, @@ -1870,9 +1823,6 @@ static const struct dentry_operations pid_dentry_operations =  /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, -				struct task_struct *, const void *); -  /*   * Fill a directory entry.   * @@ -1885,8 +1835,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,   * reported by readdir in sync with the inode numbers reported   * by stat.   */ -static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, -	char *name, int len, +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +	const char *name, int len,  	instantiate_t instantiate, struct task_struct *task, const void *ptr)  {  	struct dentry *child, *dir = filp->f_path.dentry; @@ -2219,11 +2169,7 @@ static const struct file_operations proc_fd_operations = {   */  static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)  { -	int rv; - -	if (flags & IPERM_FLAG_RCU) -		return -ECHILD; -	rv = generic_permission(inode, mask, flags, NULL); +	int rv = generic_permission(inode, mask, flags, NULL);  	if (rv == 0)  		return 0;  	if (task_pid(current) == proc_pid(inode)) @@ -2820,6 +2766,7 @@ static const struct pid_entry tgid_base_stuff[] = {  	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),  	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),  	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), +	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),  #ifdef CONFIG_NET  	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),  #endif @@ -2894,6 +2841,9 @@ static const struct pid_entry tgid_base_stuff[] = {  #ifdef CONFIG_TASK_IO_ACCOUNTING  	INF("io",	S_IRUGO, proc_tgid_io_accounting),  #endif +#ifdef CONFIG_HARDWALL +	INF("hardwall",   S_IRUGO, proc_pid_hardwall), +#endif  };  static int proc_tgid_base_readdir(struct file * filp, @@ -3168,6 +3118,7 @@ out_no_task:  static const struct pid_entry tid_base_stuff[] = {  	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),  	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), +	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),  	REG("environ",   S_IRUSR, proc_environ_operations),  	INF("auxv",      S_IRUSR, proc_pid_auxv),  	ONE("status",    S_IRUGO, proc_pid_status), @@ -3232,6 +3183,9 @@ static const struct pid_entry tid_base_stuff[] = {  #ifdef CONFIG_TASK_IO_ACCOUNTING  	INF("io",	S_IRUGO, proc_tid_io_accounting),  #endif +#ifdef CONFIG_HARDWALL +	INF("hardwall",   S_IRUGO, proc_pid_hardwall), +#endif  };  static int proc_tid_base_readdir(struct file * filp, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f1281339b6f..f1637f17c37 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,  	}  	return ent;  } +EXPORT_SYMBOL(proc_mkdir_mode);  struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,  		struct proc_dir_entry *parent) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d15aa1b1cc8..74b48cfa1bb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)  {  	struct proc_dir_entry *de;  	struct ctl_table_header *head; +	const struct proc_ns_operations *ns_ops;  	truncate_inode_pages(&inode->i_data, 0);  	end_writeback(inode); @@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)  		rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);  		sysctl_head_put(head);  	} +	/* Release any associated namespace */ +	ns_ops = PROC_I(inode)->ns_ops; +	if (ns_ops && ns_ops->put) +		ns_ops->put(PROC_I(inode)->ns);  }  static struct kmem_cache * proc_inode_cachep; @@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)  	ei->pde = NULL;  	ei->sysctl = NULL;  	ei->sysctl_entry = NULL; +	ei->ns = NULL; +	ei->ns_ops = NULL;  	inode = &ei->vfs_inode;  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  	return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c03e8d3a3a5..7838e5cfec1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;  extern const struct file_operations proc_net_operations;  extern const struct inode_operations proc_net_inode_operations; +struct proc_maps_private { +	struct pid *pid; +	struct task_struct *task; +#ifdef CONFIG_MMU +	struct vm_area_struct *tail_vma; +#endif +}; +  void proc_init_inodecache(void);  static inline struct pid *proc_pid(struct inode *inode) @@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);   */  int proc_readdir(struct file *, void *, filldir_t);  struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, +				struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +	const char *name, int len, +	instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000000..be177f702ac --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,201 @@ +#include <linux/proc_fs.h> +#include <linux/nsproxy.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> +#include <linux/mnt_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/pid_namespace.h> +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS +	&netns_operations, +#endif +#ifdef CONFIG_UTS_NS +	&utsns_operations, +#endif +#ifdef CONFIG_IPC_NS +	&ipcns_operations, +#endif +}; + +static const struct file_operations ns_file_operations = { +	.llseek		= no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, +	struct dentry *dentry, struct task_struct *task, const void *ptr) +{ +	const struct proc_ns_operations *ns_ops = ptr; +	struct inode *inode; +	struct proc_inode *ei; +	struct dentry *error = ERR_PTR(-ENOENT); +	void *ns; + +	inode = proc_pid_make_inode(dir->i_sb, task); +	if (!inode) +		goto out; + +	ns = ns_ops->get(task); +	if (!ns) +		goto out_iput; + +	ei = PROC_I(inode); +	inode->i_mode = S_IFREG|S_IRUSR; +	inode->i_fop  = &ns_file_operations; +	ei->ns_ops    = ns_ops; +	ei->ns	      = ns; + +	dentry->d_op = &pid_dentry_operations; +	d_add(dentry, inode); +	/* Close the race of the process dying before we return the dentry */ +	if (pid_revalidate(dentry, NULL)) +		error = NULL; +out: +	return error; +out_iput: +	iput(inode); +	goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, +	filldir_t filldir, struct task_struct *task, +	const struct proc_ns_operations *ops) +{ +	return proc_fill_cache(filp, dirent, filldir, +				ops->name, strlen(ops->name), +				proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, +				filldir_t filldir) +{ +	int i; +	struct dentry *dentry = filp->f_path.dentry; +	struct inode *inode = dentry->d_inode; +	struct task_struct *task = get_proc_task(inode); +	const struct proc_ns_operations **entry, **last; +	ino_t ino; +	int ret; + +	ret = -ENOENT; +	if (!task) +		goto out_no_task; + +	ret = -EPERM; +	if (!ptrace_may_access(task, PTRACE_MODE_READ)) +		goto out; + +	ret = 0; +	i = filp->f_pos; +	switch (i) { +	case 0: +		ino = inode->i_ino; +		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) +			goto out; +		i++; +		filp->f_pos++; +		/* fall through */ +	case 1: +		ino = parent_ino(dentry); +		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) +			goto out; +		i++; +		filp->f_pos++; +		/* fall through */ +	default: +		i -= 2; +		if (i >= ARRAY_SIZE(ns_entries)) { +			ret = 1; +			goto out; +		} +		entry = ns_entries + i; +		last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; +		while (entry <= last) { +			if (proc_ns_fill_cache(filp, dirent, filldir, +						task, *entry) < 0) +				goto out; +			filp->f_pos++; +			entry++; +		} +	} + +	ret = 1; +out: +	put_task_struct(task); +out_no_task: +	return ret; +} + +const struct file_operations proc_ns_dir_operations = { +	.read		= generic_read_dir, +	.readdir	= proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, +				struct dentry *dentry, struct nameidata *nd) +{ +	struct dentry *error; +	struct task_struct *task = get_proc_task(dir); +	const struct proc_ns_operations **entry, **last; +	unsigned int len = dentry->d_name.len; + +	error = ERR_PTR(-ENOENT); + +	if (!task) +		goto out_no_task; + +	error = ERR_PTR(-EPERM); +	if (!ptrace_may_access(task, PTRACE_MODE_READ)) +		goto out; + +	last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; +	for (entry = ns_entries; entry <= last; entry++) { +		if (strlen((*entry)->name) != len) +			continue; +		if (!memcmp(dentry->d_name.name, (*entry)->name, len)) +			break; +	} +	error = ERR_PTR(-ENOENT); +	if (entry > last) +		goto out; + +	error = proc_ns_instantiate(dir, dentry, task, *entry); +out: +	put_task_struct(task); +out_no_task: +	return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { +	.lookup		= proc_ns_dir_lookup, +	.getattr	= pid_getattr, +	.setattr	= proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ +	struct file *file; + +	file = fget(fd); +	if (!file) +		return ERR_PTR(-EBADF); + +	if (file->f_op != &ns_file_operations) +		goto out_invalid; + +	return file; + +out_invalid: +	fput(file); +	return ERR_PTR(-EINVAL); +} + diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c11c2..d167de365a8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)  	struct ctl_table *table;  	int error; -	if (flags & IPERM_FLAG_RCU) -		return -ECHILD; -  	/* Executable files are not allowed under /proc/sys/ */  	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))  		return -EACCES; diff --git a/fs/proc/root.c b/fs/proc/root.c index a9000e9cfee..d6c3b416529 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data)  static int proc_set_super(struct super_block *sb, void *data)  { -	struct pid_namespace *ns; - -	ns = (struct pid_namespace *)data; -	sb->s_fs_info = get_pid_ns(ns); -	return set_anon_super(sb, NULL); +	int err = set_anon_super(sb, NULL); +	if (!err) { +		struct pid_namespace *ns = (struct pid_namespace *)data; +		sb->s_fs_info = get_pid_ns(ns); +	} +	return err;  }  static struct dentry *proc_mount(struct file_system_type *fs_type, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cffa2b8a2f..9758b654a1b 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -138,9 +138,9 @@ static int stat_open(struct inode *inode, struct file *file)  	struct seq_file *m;  	int res; -	/* don't ask for more than the kmalloc() max size, currently 128 KB */ -	if (size > 128 * 1024) -		size = 128 * 1024; +	/* don't ask for more than the kmalloc() max size */ +	if (size > KMALLOC_MAX_SIZE) +		size = KMALLOC_MAX_SIZE;  	buf = kmalloc(size, GFP_KERNEL);  	if (!buf)  		return -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2e7addfd980..25b6a887adb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -211,10 +211,10 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)  {  	struct mm_struct *mm = vma->vm_mm;  	struct file *file = vma->vm_file; -	int flags = vma->vm_flags; +	vm_flags_t flags = vma->vm_flags;  	unsigned long ino = 0;  	unsigned long long pgoff = 0; -	unsigned long start; +	unsigned long start, end;  	dev_t dev = 0;  	int len; @@ -227,13 +227,15 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)  	/* We don't show the stack guard page in /proc/maps */  	start = vma->vm_start; -	if (vma->vm_flags & VM_GROWSDOWN) -		if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) -			start += PAGE_SIZE; +	if (stack_guard_page_start(vma, start)) +		start += PAGE_SIZE; +	end = vma->vm_end; +	if (stack_guard_page_end(vma, end)) +		end -= PAGE_SIZE;  	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",  			start, -			vma->vm_end, +			end,  			flags & VM_READ ? 'r' : '-',  			flags & VM_WRITE ? 'w' : '-',  			flags & VM_EXEC ? 'x' : '-', @@ -534,15 +536,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,  	char buffer[PROC_NUMBUF];  	struct mm_struct *mm;  	struct vm_area_struct *vma; -	long type; +	int type; +	int rv;  	memset(buffer, 0, sizeof(buffer));  	if (count > sizeof(buffer) - 1)  		count = sizeof(buffer) - 1;  	if (copy_from_user(buffer, buf, count))  		return -EFAULT; -	if (strict_strtol(strstrip(buffer), 10, &type)) -		return -EINVAL; +	rv = kstrtoint(strstrip(buffer), 10, &type); +	if (rv < 0) +		return rv;  	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)  		return -EINVAL;  	task = get_proc_task(file->f_path.dentry->d_inode); @@ -767,18 +771,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  	if (!task)  		goto out; -	mm = mm_for_maps(task); -	ret = PTR_ERR(mm); -	if (!mm || IS_ERR(mm)) -		goto out_task; -  	ret = -EINVAL;  	/* file position must be aligned */  	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))  		goto out_task;  	ret = 0; -  	if (!count)  		goto out_task; @@ -786,7 +784,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);  	ret = -ENOMEM;  	if (!pm.buffer) -		goto out_mm; +		goto out_task; + +	mm = mm_for_maps(task); +	ret = PTR_ERR(mm); +	if (!mm || IS_ERR(mm)) +		goto out_free;  	pagemap_walk.pmd_entry = pagemap_pte_range;  	pagemap_walk.pte_hole = pagemap_pte_hole; @@ -829,7 +832,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  		len = min(count, PM_ENTRY_BYTES * pm.pos);  		if (copy_to_user(buf, pm.buffer, len)) {  			ret = -EFAULT; -			goto out_free; +			goto out_mm;  		}  		copied += len;  		buf += len; @@ -839,10 +842,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,  	if (!ret || ret == PM_END_OF_BUFFER)  		ret = copied; -out_free: -	kfree(pm.buffer);  out_mm:  	mmput(mm); +out_free: +	kfree(pm.buffer);  out_task:  	put_task_struct(task);  out: @@ -856,7 +859,192 @@ const struct file_operations proc_pagemap_operations = {  #endif /* CONFIG_PROC_PAGE_MONITOR */  #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); + +struct numa_maps { +	struct vm_area_struct *vma; +	unsigned long pages; +	unsigned long anon; +	unsigned long active; +	unsigned long writeback; +	unsigned long mapcount_max; +	unsigned long dirty; +	unsigned long swapcache; +	unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { +	struct proc_maps_private proc_maps; +	struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +{ +	int count = page_mapcount(page); + +	md->pages++; +	if (pte_dirty || PageDirty(page)) +		md->dirty++; + +	if (PageSwapCache(page)) +		md->swapcache++; + +	if (PageActive(page) || PageUnevictable(page)) +		md->active++; + +	if (PageWriteback(page)) +		md->writeback++; + +	if (PageAnon(page)) +		md->anon++; + +	if (count > md->mapcount_max) +		md->mapcount_max = count; + +	md->node[page_to_nid(page)]++; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, +		unsigned long end, struct mm_walk *walk) +{ +	struct numa_maps *md; +	spinlock_t *ptl; +	pte_t *orig_pte; +	pte_t *pte; + +	md = walk->private; +	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); +	do { +		struct page *page; +		int nid; + +		if (!pte_present(*pte)) +			continue; + +		page = vm_normal_page(md->vma, addr, *pte); +		if (!page) +			continue; + +		if (PageReserved(page)) +			continue; + +		nid = page_to_nid(page); +		if (!node_isset(nid, node_states[N_HIGH_MEMORY])) +			continue; + +		gather_stats(page, md, pte_dirty(*pte)); + +	} while (pte++, addr += PAGE_SIZE, addr != end); +	pte_unmap_unlock(orig_pte, ptl); +	return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +		unsigned long addr, unsigned long end, struct mm_walk *walk) +{ +	struct numa_maps *md; +	struct page *page; + +	if (pte_none(*pte)) +		return 0; + +	page = pte_page(*pte); +	if (!page) +		return 0; + +	md = walk->private; +	gather_stats(page, md, pte_dirty(*pte)); +	return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +		unsigned long addr, unsigned long end, struct mm_walk *walk) +{ +	return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ +	struct numa_maps_private *numa_priv = m->private; +	struct proc_maps_private *proc_priv = &numa_priv->proc_maps; +	struct vm_area_struct *vma = v; +	struct numa_maps *md = &numa_priv->md; +	struct file *file = vma->vm_file; +	struct mm_struct *mm = vma->vm_mm; +	struct mm_walk walk = {}; +	struct mempolicy *pol; +	int n; +	char buffer[50]; + +	if (!mm) +		return 0; + +	/* Ensure we start with an empty set of numa_maps statistics. */ +	memset(md, 0, sizeof(*md)); + +	md->vma = vma; + +	walk.hugetlb_entry = gather_hugetbl_stats; +	walk.pmd_entry = gather_pte_stats; +	walk.private = md; +	walk.mm = mm; + +	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); +	mpol_to_str(buffer, sizeof(buffer), pol, 0); +	mpol_cond_put(pol); + +	seq_printf(m, "%08lx %s", vma->vm_start, buffer); + +	if (file) { +		seq_printf(m, " file="); +		seq_path(m, &file->f_path, "\n\t= "); +	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { +		seq_printf(m, " heap"); +	} else if (vma->vm_start <= mm->start_stack && +			vma->vm_end >= mm->start_stack) { +		seq_printf(m, " stack"); +	} + +	walk_page_range(vma->vm_start, vma->vm_end, &walk); + +	if (!md->pages) +		goto out; + +	if (md->anon) +		seq_printf(m, " anon=%lu", md->anon); + +	if (md->dirty) +		seq_printf(m, " dirty=%lu", md->dirty); + +	if (md->pages != md->anon && md->pages != md->dirty) +		seq_printf(m, " mapped=%lu", md->pages); + +	if (md->mapcount_max > 1) +		seq_printf(m, " mapmax=%lu", md->mapcount_max); + +	if (md->swapcache) +		seq_printf(m, " swapcache=%lu", md->swapcache); + +	if (md->active < md->pages && !is_vm_hugetlb_page(vma)) +		seq_printf(m, " active=%lu", md->active); + +	if (md->writeback) +		seq_printf(m, " writeback=%lu", md->writeback); + +	for_each_node_state(n, N_HIGH_MEMORY) +		if (md->node[n]) +			seq_printf(m, " N%d=%lu", n, md->node[n]); +out: +	seq_putc(m, '\n'); + +	if (m->count < m->size) +		m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; +	return 0; +}  static const struct seq_operations proc_pid_numa_maps_op = {          .start  = m_start, @@ -867,7 +1055,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {  static int numa_maps_open(struct inode *inode, struct file *file)  { -	return do_maps_open(inode, file, &proc_pid_numa_maps_op); +	struct numa_maps_private *priv; +	int ret = -ENOMEM; +	priv = kzalloc(sizeof(*priv), GFP_KERNEL); +	if (priv) { +		priv->proc_maps.pid = proc_pid(inode); +		ret = seq_open(file, &proc_pid_numa_maps_op); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = priv; +		} else { +			kfree(priv); +		} +	} +	return ret;  }  const struct file_operations proc_numa_maps_operations = { @@ -876,4 +1077,4 @@ const struct file_operations proc_numa_maps_operations = {  	.llseek		= seq_lseek,  	.release	= seq_release_private,  }; -#endif +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 74802bc5ded..cd99bf55765 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -35,6 +35,46 @@ static u64 vmcore_size;  static struct proc_dir_entry *proc_vmcore = NULL; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ +	if (oldmem_pfn_is_ram) +		return -EBUSY; +	oldmem_pfn_is_ram = fn; +	return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ +	oldmem_pfn_is_ram = NULL; +	wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ +	int (*fn)(unsigned long pfn); +	/* pfn is ram unless fn() checks pagetype */ +	int ret = 1; + +	/* +	 * Ask hypervisor if the pfn is really ram. +	 * A ballooned page contains no data and reading from such a page +	 * will cause high load in the hypervisor. +	 */ +	fn = oldmem_pfn_is_ram; +	if (fn) +		ret = fn(pfn); + +	return ret; +} +  /* Reads a page from the oldmem device from given offset. */  static ssize_t read_from_oldmem(char *buf, size_t count,  				u64 *ppos, int userbuf) @@ -55,9 +95,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,  		else  			nr_bytes = count; -		tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); -		if (tmp < 0) -			return tmp; +		/* If pfn is not ram, return zeros for sparse dump files */ +		if (pfn_is_ram(pfn) == 0) +			memset(buf, 0, nr_bytes); +		else { +			tmp = copy_oldmem_page(pfn, buf, nr_bytes, +						offset, userbuf); +			if (tmp < 0) +				return tmp; +		}  		*ppos += nr_bytes;  		count -= nr_bytes;  		buf += nr_bytes; | 
