From 4e4d6d860b9393c5395ba5920edb5b4c5d43a3a3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 18 Dec 2011 20:05:43 -0800 Subject: sysfs: Add s_hash to sysfs_dirent and order directory entries by hash Compute a 31 bit hash of directory entries (that can fit in a signed 32bit off_t) and index the sysfs directory entries by that hash, replacing the per directory indexes by name and by inode. Because we now only use a single rbtree this reduces the size of sysfs_dirent by 2 pointers. Because we have fewer cases to deal with the code is now simpler. For now I use the simple hash that the dcache uses as that is easy to use and seems simple enough. In addition to making the code simpler, using a hash for the file position in readdir brings sysfs in line with other filesystems that have non-trivial directory structures. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 219 +++++++++++++++++++++++++++++-------------------------- fs/sysfs/sysfs.h | 9 +-- 2 files changed, 120 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7fdf6a7b743..0daf255b7bf 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -22,76 +22,103 @@ #include #include #include +#include #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); DEFINE_SPINLOCK(sysfs_assoc_lock); +#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb); + static DEFINE_SPINLOCK(sysfs_ino_lock); static DEFINE_IDA(sysfs_ino_ida); /** - * sysfs_link_sibling - link sysfs_dirent into sibling list + * sysfs_name_hash + * @ns: Namespace tag to hash + * @name: Null terminated string to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int sysfs_name_hash(const void *ns, const char *name) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) );
hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 1) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int sysfs_name_compare(unsigned int hash, const void *ns, + const char *name, const struct sysfs_dirent *sd) +{ + if (hash != sd->s_hash) + return hash - sd->s_hash; + if (ns != sd->s_ns) + return ns - sd->s_ns; + return strcmp(name, sd->s_name); +} + +static int sysfs_sd_compare(const struct sysfs_dirent *left, + const struct sysfs_dirent *right) +{ + return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name, + right); +} + +/** + * sysfs_link_subling - link sysfs_dirent into sibling rbtree * @sd: sysfs_dirent of interest * - * Link @sd into its sibling list which starts from + * Link @sd into its sibling rbtree which starts from * sd->s_parent->s_dir.children. * * Locking: * mutex_lock(sysfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. */ -static void sysfs_link_sibling(struct sysfs_dirent *sd) +static int sysfs_link_sibling(struct sysfs_dirent *sd) { - struct sysfs_dirent *parent_sd = sd->s_parent; - - struct rb_node **p; - struct rb_node *parent; + struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; + struct rb_node *parent = NULL; if (sysfs_type(sd) == SYSFS_DIR) - parent_sd->s_dir.subdirs++; - - p = &parent_sd->s_dir.inode_tree.rb_node; - parent = NULL; - while (*p) { - parent = *p; -#define node rb_entry(parent, struct sysfs_dirent, inode_node) - if (sd->s_ino < node->s_ino) { - p = &node->inode_node.rb_left; - } else if (sd->s_ino > node->s_ino) { - p = &node->inode_node.rb_right; - } else { - printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", - (unsigned long) sd->s_ino); - BUG(); - } -#undef node - } - rb_link_node(&sd->inode_node, parent, p); - rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); - - p = &parent_sd->s_dir.name_tree.rb_node; - parent = NULL; - while (*p) { - int c; - parent = *p; -#define 
node rb_entry(parent, struct sysfs_dirent, name_node) - c = strcmp(sd->s_name, node->s_name); - if (c < 0) { - p = &node->name_node.rb_left; - } else { - p = &node->name_node.rb_right; - } -#undef node + sd->s_parent->s_dir.subdirs++; + + while (*node) { + struct sysfs_dirent *pos; + int result; + + pos = to_sysfs_dirent(*node); + parent = *node; + result = sysfs_sd_compare(sd, pos); + if (result < 0) + node = &pos->s_rb.rb_left; + else if (result > 0) + node = &pos->s_rb.rb_right; + else + return -EEXIST; } - rb_link_node(&sd->name_node, parent, p); - rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); + /* add new node and rebalance the tree */ + rb_link_node(&sd->s_rb, parent, node); + rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); + return 0; } /** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling list + * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree * @sd: sysfs_dirent of interest * - * Unlink @sd from its sibling list which starts from + * Unlink @sd from its sibling rbtree which starts from * sd->s_parent->s_dir.children. 
* * Locking: @@ -102,8 +129,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) if (sysfs_type(sd) == SYSFS_DIR) sd->s_parent->s_dir.subdirs--; - rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); - rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); + rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); } /** @@ -402,6 +428,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; + int ret; if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", @@ -410,12 +437,12 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) return -EINVAL; } - if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) - return -EEXIST; - + sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); sd->s_parent = sysfs_get(acxt->parent_sd); - sysfs_link_sibling(sd); + ret = sysfs_link_sibling(sd); + if (ret) + return ret; /* Update timestamps on the parent */ ps_iattr = acxt->parent_sd->s_iattr; @@ -565,8 +592,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, const void *ns, const unsigned char *name) { - struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; - struct sysfs_dirent *found = NULL; + struct rb_node *node = parent_sd->s_dir.children.rb_node; + unsigned int hash; if (!!sysfs_ns_type(parent_sd) != !!ns) { WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", @@ -575,33 +602,21 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, return NULL; } - while (p) { - int c; -#define node rb_entry(p, struct sysfs_dirent, name_node) - c = strcmp(name, node->s_name); - if (c < 0) { - p = node->name_node.rb_left; - } else if (c > 0) { - p = node->name_node.rb_right; - } else { - found = node; - p = node->name_node.rb_left; - } -#undef node - } - - if (found) { - while (found->s_ns != ns) { - p = rb_next(&found->name_node); - if 
(!p) - return NULL; - found = rb_entry(p, struct sysfs_dirent, name_node); - if (strcmp(name, found->s_name)) - return NULL; - } + hash = sysfs_name_hash(ns, name); + while (node) { + struct sysfs_dirent *sd; + int result; + + sd = to_sysfs_dirent(node); + result = sysfs_name_compare(hash, ns, name, sd); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return sd; } - - return found; + return NULL; } /** @@ -804,9 +819,9 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); sysfs_addrm_start(&acxt, dir_sd); - pos = rb_first(&dir_sd->s_dir.inode_tree); + pos = rb_first(&dir_sd->s_dir.children); while (pos) { - struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); + struct sysfs_dirent *sd = to_sysfs_dirent(pos); pos = rb_next(pos); if (sysfs_type(sd) != SYSFS_DIR) sysfs_remove_one(&acxt, sd); @@ -919,38 +934,36 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp) } static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) + struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) { if (pos) { int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && pos->s_parent == parent_sd && - ino == pos->s_ino; + hash == pos->s_hash; sysfs_put(pos); if (!valid) pos = NULL; } - if (!pos && (ino > 1) && (ino < INT_MAX)) { - struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; - while (p) { -#define node rb_entry(p, struct sysfs_dirent, inode_node) - if (ino < node->s_ino) { - pos = node; - p = node->inode_node.rb_left; - } else if (ino > node->s_ino) { - p = node->inode_node.rb_right; - } else { - pos = node; + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent_sd->s_dir.children.rb_node; + while (node) { + pos = to_sysfs_dirent(node); + + if (hash < pos->s_hash) + node = node->rb_left; + else if (hash > pos->s_hash) + node 
= node->rb_right; + else break; - } -#undef node } } + /* Skip over entries in the wrong namespace */ while (pos && pos->s_ns != ns) { - struct rb_node *p = rb_next(&pos->inode_node); - if (!p) + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) pos = NULL; else - pos = rb_entry(p, struct sysfs_dirent, inode_node); + pos = to_sysfs_dirent(node); } return pos; } @@ -960,11 +973,11 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, { pos = sysfs_dir_pos(ns, parent_sd, ino, pos); if (pos) do { - struct rb_node *p = rb_next(&pos->inode_node); - if (!p) + struct rb_node *node = rb_next(&pos->s_rb); + if (!node) pos = NULL; else - pos = rb_entry(p, struct sysfs_dirent, inode_node); + pos = to_sysfs_dirent(node); } while (pos && pos->s_ns != ns); return pos; } @@ -1006,7 +1019,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) len = strlen(name); ino = pos->s_ino; type = dt_type(pos); - filp->f_pos = ino; + filp->f_pos = pos->s_hash; filp->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 7484a36ee67..2b5c923b4b9 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -20,9 +20,8 @@ struct sysfs_elem_dir { struct kobject *kobj; unsigned long subdirs; - - struct rb_root inode_tree; - struct rb_root name_tree; + /* children rbtree starts here and goes through sd->s_rb */ + struct rb_root children; }; struct sysfs_elem_symlink { @@ -62,8 +61,7 @@ struct sysfs_dirent { struct sysfs_dirent *s_parent; const char *s_name; - struct rb_node inode_node; - struct rb_node name_node; + struct rb_node s_rb; union { struct completion *completion; @@ -71,6 +69,7 @@ struct sysfs_dirent { } u; const void *s_ns; /* namespace tag */ + unsigned int s_hash; /* ns + name hash */ union { struct sysfs_elem_dir s_dir; struct sysfs_elem_symlink s_symlink; -- cgit v1.2.3-70-g09d2 From 15a3382451e51925facfe430deeca63d90137f5d Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sun, 18 Dec 2011 20:07:23 -0800 Subject: sysfs: Reduce s_flags to an unsigned short so it packs well with s_mode On 32bit this reduces sizeof(struct sysfs_dirent) by 2 bytes. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/sysfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 2b5c923b4b9..19994948ac5 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -77,7 +77,7 @@ struct sysfs_dirent { struct sysfs_elem_bin_attr s_bin_attr; }; - unsigned int s_flags; + unsigned short s_flags; umode_t s_mode; ino_t s_ino; struct sysfs_inode_attrs *s_iattr; @@ -94,11 +94,11 @@ struct sysfs_dirent { #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) /* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xff00 +#define SYSFS_NS_TYPE_MASK 0xf00 #define SYSFS_NS_TYPE_SHIFT 8 #define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED 0x020000 +#define SYSFS_FLAG_REMOVED 0x02000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { -- cgit v1.2.3-70-g09d2 From cafa6b5dd7ce4f0e0a30be301be4efed587a7808 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 18 Dec 2011 20:08:16 -0800 Subject: sysfs: Store the sysfs inode in an unsigned int. Store the sysfs inode number in an unsigned int because the ida inode allocator can return at most a 31 bit number, reducing the size of struct sysfs_dirent by 8 bytes on 64bit platforms. Signed-off-by: Eric W.
Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++-- fs/sysfs/sysfs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0daf255b7bf..0589c9a694b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -224,7 +224,7 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) rwsem_release(&sd->dep_map, 1, _RET_IP_); } -static int sysfs_alloc_ino(ino_t *pino) +static int sysfs_alloc_ino(unsigned int *pino) { int ino, rc; @@ -243,7 +243,7 @@ static int sysfs_alloc_ino(ino_t *pino) return rc; } -static void sysfs_free_ino(ino_t ino) +static void sysfs_free_ino(unsigned int ino) { spin_lock(&sysfs_ino_lock); ida_remove(&sysfs_ino_ida, ino); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 19994948ac5..661a9639570 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -79,7 +79,7 @@ struct sysfs_dirent { unsigned short s_flags; umode_t s_mode; - ino_t s_ino; + unsigned int s_ino; struct sysfs_inode_attrs *s_iattr; }; -- cgit v1.2.3-70-g09d2 From 524b6c5b39b931311dfe5a2f5abae2f5c9731676 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 18 Dec 2011 20:09:31 -0800 Subject: sysfs: Kill nlink counting. Tracking the number of subdirectories requires an extra field that increases the size of sysfs_dirent. nlinks are not particularly interesting for sysfs and the nlink counts are wrong when network namespaces are involved so stop counting them, and always return nlink == 1. Userspace already knows that directories with nlink == 1 have an nlink count they can't use to count subdirectories. This reduces the size of sysfs_dirent by 8 bytes on 64bit platforms. Signed-off-by: Eric W. 
Biederman Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 ------ fs/sysfs/inode.c | 3 --- fs/sysfs/sysfs.h | 1 - 3 files changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0589c9a694b..ea64d01400a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -91,9 +91,6 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; struct rb_node *parent = NULL; - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - while (*node) { struct sysfs_dirent *pos; int result; @@ -126,9 +123,6 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4a802b4a905..0ac3e1c1a7d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -216,9 +216,6 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) iattrs->ia_secdata, iattrs->ia_secdata_len); } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 661a9639570..6289a00287d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -19,7 +19,6 @@ struct sysfs_open_dirent; struct sysfs_elem_dir { struct kobject *kobj; - unsigned long subdirs; /* children rbtree starts here and goes through sd->s_rb */ struct rb_root children; }; -- cgit v1.2.3-70-g09d2 From a4834c102f4a46808630cad1a545cb0706b3b0a2 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 5 Jan 2012 13:06:02 +0400 Subject: tty: move pty count limiting into devpts Let's move this stuff to the better place, where we can account pty right in tty-indexes managing code. 
Signed-off-by: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 51 --------------------------------------------------- fs/devpts/inode.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 03147fa31d4..d505837b347 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -439,55 +438,9 @@ static inline void legacy_pty_init(void) { } /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS -/* - * sysctl support for setting limits on the number of Unix98 ptys allocated. - * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. - */ -int pty_limit = NR_UNIX98_PTY_DEFAULT; -static int pty_limit_min; -static int pty_limit_max = NR_UNIX98_PTY_MAX; -static int pty_count; static struct cdev ptmx_cdev; -static struct ctl_table pty_table[] = { - { - .procname = "max", - .maxlen = sizeof(int), - .mode = 0644, - .data = &pty_limit, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pty_limit_min, - .extra2 = &pty_limit_max, - }, { - .procname = "nr", - .maxlen = sizeof(int), - .mode = 0444, - .data = &pty_count, - .proc_handler = proc_dointvec, - }, - {} -}; - -static struct ctl_table pty_kern_table[] = { - { - .procname = "pty", - .mode = 0555, - .child = pty_table, - }, - {} -}; - -static struct ctl_table pty_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = pty_kern_table, - }, - {} -}; - - static int pty_unix98_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { @@ -587,7 +540,6 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) */ tty_driver_kref_get(driver); tty->count++; - pty_count++; return 0; err_free_mem: deinitialize_tty_struct(o_tty); @@ -601,7 +553,6 @@ err_free_tty: static void ptm_unix98_remove(struct tty_driver *driver, struct 
tty_struct *tty) { - pty_count--; } static void pts_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) @@ -760,8 +711,6 @@ static void __init unix98_pty_init(void) if (tty_register_driver(pts_driver)) panic("Couldn't register Unix98 pts driver"); - register_sysctl_table(pty_root_table); - /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c4e2a58a2e8..c2c7317d568 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -36,7 +36,52 @@ #define DEVPTS_DEFAULT_PTMX_MODE 0000 #define PTMX_MINOR 2 -extern int pty_limit; /* Config limit on Unix98 ptys */ +/* + * sysctl support for setting limits on the number of Unix98 ptys allocated. + * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. + */ +static int pty_limit = NR_UNIX98_PTY_DEFAULT; +static int pty_limit_min; +static int pty_limit_max = NR_UNIX98_PTY_MAX; +static int pty_count; + +static struct ctl_table pty_table[] = { + { + .procname = "max", + .maxlen = sizeof(int), + .mode = 0644, + .data = &pty_limit, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pty_limit_min, + .extra2 = &pty_limit_max, + }, { + .procname = "nr", + .maxlen = sizeof(int), + .mode = 0444, + .data = &pty_count, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_table pty_kern_table[] = { + { + .procname = "pty", + .mode = 0555, + .child = pty_table, + }, + {} +}; + +static struct ctl_table pty_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = pty_kern_table, + }, + {} +}; + static DEFINE_MUTEX(allocated_ptys_lock); static struct vfsmount *devpts_mnt; @@ -451,6 +496,7 @@ retry: mutex_unlock(&allocated_ptys_lock); return -EIO; } + pty_count++; mutex_unlock(&allocated_ptys_lock); return index; } @@ -462,6 +508,7 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); + 
pty_count--; mutex_unlock(&allocated_ptys_lock); } @@ -558,11 +605,15 @@ void devpts_pty_kill(struct tty_struct *tty) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); + struct ctl_table_header *table; + if (!err) { + table = register_sysctl_table(pty_root_table); devpts_mnt = kern_mount(&devpts_fs_type); if (IS_ERR(devpts_mnt)) { err = PTR_ERR(devpts_mnt); unregister_filesystem(&devpts_fs_type); + unregister_sysctl_table(table); } } return err; -- cgit v1.2.3-70-g09d2 From e9aba5158a80098447ff207a452a3418ae7ee386 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 5 Jan 2012 13:06:11 +0400 Subject: tty: rework pty count limiting After adding devpts multiple-instances, sysctl kernel.pty.max limits the pty count for each devpts instance independently, while kernel.pty.nr shows the total pty count. This patch restores sysctl kernel.pty.max as a global limit (4096 by default), adds a pty reserve for the main devpts (mounted without the "newinstance" argument), and a new sysctl to tune it: kernel.pty.reserve (1024 by default). Also it adds the devpts mount option "max=%d" to limit the pty count for each devpts instance independently (by default NR_UNIX98_PTY_MAX == 2^20). Thus devpts instances in containers cannot eat up all available ptys even if we didn't set any limits, while with the "max" argument we can adjust limits more precisely. Plus, open("/dev/ptmx") now returns -ENOSPC when no pty index is available, which is more informative than -EIO. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 34 ++++++++++++++++++++++++++++++---- include/linux/tty.h | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c2c7317d568..1c6f908e38c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -41,8 +41,9 @@ * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly.
*/ static int pty_limit = NR_UNIX98_PTY_DEFAULT; +static int pty_reserve = NR_UNIX98_PTY_RESERVE; static int pty_limit_min; -static int pty_limit_max = NR_UNIX98_PTY_MAX; +static int pty_limit_max = INT_MAX; static int pty_count; static struct ctl_table pty_table[] = { @@ -54,6 +55,14 @@ static struct ctl_table pty_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &pty_limit_min, .extra2 = &pty_limit_max, + }, { + .procname = "reserve", + .maxlen = sizeof(int), + .mode = 0644, + .data = &pty_reserve, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pty_limit_min, + .extra2 = &pty_limit_max, }, { .procname = "nr", .maxlen = sizeof(int), @@ -94,10 +103,11 @@ struct pts_mount_opts { umode_t mode; umode_t ptmxmode; int newinstance; + int max; }; enum { - Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, + Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_max, Opt_err }; @@ -108,6 +118,7 @@ static const match_table_t tokens = { #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, + {Opt_max, "max=%d"}, #endif {Opt_err, NULL} }; @@ -154,6 +165,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->gid = 0; opts->mode = DEVPTS_DEFAULT_MODE; opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + opts->max = NR_UNIX98_PTY_MAX; /* newinstance makes sense only on initial mount */ if (op == PARSE_MOUNT) @@ -197,6 +209,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) if (op == PARSE_MOUNT) opts->newinstance = 1; break; + case Opt_max: + if (match_int(&args[0], &option) || + option < 0 || option > NR_UNIX98_PTY_MAX) + return -EINVAL; + opts->max = option; + break; #endif default: printk(KERN_ERR "devpts: called with bogus options\n"); @@ -303,6 +321,8 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",mode=%03o", opts->mode); #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, 
",ptmxmode=%03o", opts->ptmxmode); + if (opts->max < NR_UNIX98_PTY_MAX) + seq_printf(seq, ",max=%d", opts->max); #endif return 0; @@ -483,6 +503,12 @@ retry: return -ENOMEM; mutex_lock(&allocated_ptys_lock); + if (pty_count >= pty_limit - + (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + mutex_unlock(&allocated_ptys_lock); + return -ENOSPC; + } + ida_ret = ida_get_new(&fsi->allocated_ptys, &index); if (ida_ret < 0) { mutex_unlock(&allocated_ptys_lock); @@ -491,10 +517,10 @@ retry: return -EIO; } - if (index >= pty_limit) { + if (index >= fsi->mount_opts.max) { ida_remove(&fsi->allocated_ptys, index); mutex_unlock(&allocated_ptys_lock); - return -EIO; + return -ENOSPC; } pty_count++; mutex_unlock(&allocated_ptys_lock); diff --git a/include/linux/tty.h b/include/linux/tty.h index d3ebd765b54..d4077418820 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -52,6 +52,7 @@ * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ +#define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* -- cgit v1.2.3-70-g09d2 From c56d8a7362665d165ba992b6b7a8d6c13a26eafc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 17 Jan 2012 12:17:22 +0000 Subject: sysfs: change permissions for /sys from 0755 to 0555 There is a misleading difference between /proc and /sys permissions, /proc is 0555 and /sys is 0755. But as it is impossible to create or unlink something in /sys it would be nice to have same permissions. 
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e34f0d99ea4..140f26a3428 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -36,7 +36,7 @@ struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), - .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, .s_ino = 1, }; -- cgit v1.2.3-70-g09d2 From d6e486868cde585842d55ba3b6ec57af090fc343 Mon Sep 17 00:00:00 2001 From: Ludwig Nussel Date: Wed, 25 Jan 2012 11:52:28 +0100 Subject: debugfs: add mode, uid and gid options Cautious admins may want to restrict access to debugfs. Currently a manual chown/chmod e.g. in an init script is needed to achieve that. Distributions that want to make the mount options configurable need to add extra config files. By allowing to set the root inode's uid, gid and mode via mount options no such hacks are needed anymore. Instead configuration becomes straight forward via fstab. Signed-off-by: Ludwig Nussel Signed-off-by: Greg Kroah-Hartman --- Documentation/filesystems/debugfs.txt | 5 +- fs/debugfs/inode.c | 149 +++++++++++++++++++++++++++++++++- 2 files changed, 152 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt index 6872c91bce3..4e257587318 100644 --- a/Documentation/filesystems/debugfs.txt +++ b/Documentation/filesystems/debugfs.txt @@ -14,7 +14,10 @@ Debugfs is typically mounted with a command like: mount -t debugfs none /sys/kernel/debug -(Or an equivalent /etc/fstab line). +(Or an equivalent /etc/fstab line). +The debugfs root directory is accessible by anyone by default. To +restrict access to the tree the "uid", "gid" and "mode" mount +options can be used. 
Note that the debugfs API is exported GPL-only to modules. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 956d5ddddf6..b80bc846a15 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -23,9 +23,13 @@ #include #include #include +#include +#include #include #include +#define DEBUGFS_DEFAULT_MODE 0755 + static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -125,11 +129,154 @@ static inline int debugfs_positive(struct dentry *dentry) return dentry->d_inode && !d_unhashed(dentry); } +struct debugfs_mount_opts { + uid_t uid; + gid_t gid; + umode_t mode; +}; + +enum { + Opt_uid, + Opt_gid, + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct debugfs_fs_info { + struct debugfs_mount_opts mount_opts; +}; + +static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + char *p; + + opts->mode = DEBUGFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + opts->uid = option; + break; + case Opt_gid: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->gid = option; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally debugfs has ignored all mount options + */ + } + } + + return 0; +} + +static int debugfs_apply_options(struct super_block *sb) +{ + struct debugfs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = sb->s_root->d_inode; + struct debugfs_mount_opts *opts = &fsi->mount_opts; + + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + + inode->i_uid = opts->uid; + 
inode->i_gid = opts->gid; + + return 0; +} + +static int debugfs_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct debugfs_fs_info *fsi = sb->s_fs_info; + + err = debugfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + debugfs_apply_options(sb); + +fail: + return err; +} + +static int debugfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; + struct debugfs_mount_opts *opts = &fsi->mount_opts; + + if (opts->uid != 0) + seq_printf(m, ",uid=%u", opts->uid); + if (opts->gid != 0) + seq_printf(m, ",gid=%u", opts->gid); + if (opts->mode != DEBUGFS_DEFAULT_MODE) + seq_printf(m, ",mode=%o", opts->mode); + + return 0; +} + +static const struct super_operations debugfs_super_operations = { + .statfs = simple_statfs, + .remount_fs = debugfs_remount, + .show_options = debugfs_show_options, +}; + static int debug_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr debug_files[] = {{""}}; + struct debugfs_fs_info *fsi; + int err; + + save_mount_options(sb, data); + + fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = debugfs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); + if (err) + goto fail; + + sb->s_op = &debugfs_super_operations; + + debugfs_apply_options(sb); + + return 0; - return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); +fail: + kfree(fsi); + sb->s_fs_info = NULL; + return err; } static struct dentry *debug_mount(struct file_system_type *fs_type, -- cgit v1.2.3-70-g09d2 From d5c38b137ac8a6e3dbed13bc494d60df5b69dfc4 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Tue, 31 Jan 2012 06:40:26 -0800 Subject: sysfs: Update the name hash when renaming sysfs entries This fixes a bug introduced with sysfs name hashes where renaming a network device appears to succeed but silently makes the sysfs files for that network device inaccessible. In at least one configuration this bug has stopped networking from coming up during boot. Signed-off-by: Eric W. Biederman Tested-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ea64d01400a..dd3779cf3a3 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -872,6 +872,7 @@ int sysfs_rename(struct sysfs_dirent *sd, dup_name = sd->s_name; sd->s_name = new_name; + sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); } /* Move to the appropriate place in the appropriate directories rbtree. */ -- cgit v1.2.3-70-g09d2 From 8112b9830a056c3f42423e4e8e914ac9f7162dce Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 22 Jan 2012 23:27:00 +0900 Subject: reiserfs: fix printk typo in lbalance.c Correct spelling "entry_cout" to "entry_count" in fs/reiserfs/lbalance.c Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/reiserfs/lbalance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 03d85cbf90b..b43d0155631 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -975,7 +975,7 @@ static int leaf_cut_entries(struct buffer_head *bh, remove */ RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); RFALSE(I_ENTRY_COUNT(ih) < from + del_count, - "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", I_ENTRY_COUNT(ih), from, del_count); if (del_count == 0) -- cgit v1.2.3-70-g09d2 From 982a598ff68acad37647baba06668054568eee49 Mon 
Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 24 Jan 2012 02:29:36 +0900 Subject: ntfs: fix printk typos in mft.c Correct two spelling errors "dealocate" to "deallocate" in fs/ntfs/mft.c Signed-off-by: Jiri Kosina --- fs/ntfs/mft.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 382857f9c7d..862f7ff57b7 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft " "bitmap."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate " + ntfs_error(vol->sb, "Failed to deallocate " "allocated cluster.%s", es); NVolSetErrors(vol); } @@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft data " "attribute."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate clusters " + ntfs_error(vol->sb, "Failed to deallocate clusters " "from the mft data attribute.%s", es); NVolSetErrors(vol); } -- cgit v1.2.3-70-g09d2 From 934e7d44b810691ae5aefa3308b97a402aac1a55 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 7 Feb 2012 22:21:45 +0900 Subject: btrfs: Fix typo in free-space-cache.c Correct spelling "cace" to "cache" in fs/btrfs/free-space-cache.c Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c2f20594c9f..7f4f3025357 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1067,7 +1067,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); ret = 0; #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cace " + printk(KERN_ERR "btrfs: failed to write free space 
cache " "for block group %llu\n", block_group->key.objectid); #endif } -- cgit v1.2.3-70-g09d2 From 42ea19790e82498e14a24e97b7cf2a83d89203b6 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 8 Feb 2012 20:39:39 +0900 Subject: jffs2: Fix typo in compr.c Correct spelling "modul" to "module" in fs/jffs2/compr.c Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/jffs2/compr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 5b6c9d1a2fb..96ed3c9ec3f 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -340,7 +340,7 @@ int jffs2_unregister_compressor(struct jffs2_compressor *comp) if (comp->usecount) { spin_unlock(&jffs2_compressor_list_lock); - printk(KERN_WARNING "JFFS2: Compressor modul is in use. Unregister failed.\n"); + printk(KERN_WARNING "JFFS2: Compressor module is in use. Unregister failed.\n"); return -1; } list_del(&comp->list); -- cgit v1.2.3-70-g09d2 From 3e93b8dfd9dd8735152e59913a2bde226f83d43e Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 5 Feb 2012 01:29:47 +0100 Subject: BTRFS: Don't include disk-io.h twice in check-integrity.c Once should be enough.
Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- fs/btrfs/check-integrity.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b669a7d8e49..064b29bd160 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -89,7 +89,6 @@ #include "disk-io.h" #include "transaction.h" #include "extent_io.h" -#include "disk-io.h" #include "volumes.h" #include "print-tree.h" #include "locking.h" -- cgit v1.2.3-70-g09d2 From 0cc785ecbf6c04c1ef01c311accee859c856a6b9 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sat, 11 Feb 2012 21:35:12 +0900 Subject: cramfs: Fix typo in inode.c Correct spelling "endianess" to "endianness" in fs/cramfs/inode.c Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- fs/cramfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a2ee8f9f5a3..04d51f9333d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -257,10 +257,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* Do sanity checks on the superblock */ if (super.magic != CRAMFS_MAGIC) { - /* check for wrong endianess */ + /* check for wrong endianness */ if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) - printk(KERN_ERR "cramfs: wrong endianess\n"); + printk(KERN_ERR "cramfs: wrong endianness\n"); goto out; } @@ -270,7 +270,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&read_mutex); if (super.magic != CRAMFS_MAGIC) { if (super.magic == CRAMFS_MAGIC_WEND && !silent) - printk(KERN_ERR "cramfs: wrong endianess\n"); + printk(KERN_ERR "cramfs: wrong endianness\n"); else if (!silent) printk(KERN_ERR "cramfs: wrong magic\n"); goto out; -- cgit v1.2.3-70-g09d2 From a80581d0d1b11b2d4bbb9333c1cac5416714112d Mon Sep 17 00:00:00 2001 From: "Justin P. 
Mattock" Date: Sat, 11 Feb 2012 05:55:58 -0800 Subject: Typos: change aditional to additional. The below patch fixes some typos "aditional" to "additional", and also fixes a comment with another word mispelled. Signed-off-by: Justin P. Mattock Signed-off-by: Jiri Kosina --- arch/powerpc/include/asm/keylargo.h | 2 +- drivers/hwmon/tmp401.c | 2 +- fs/ntfs/layout.h | 4 ++-- sound/pci/hda/patch_conexant.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/include/asm/keylargo.h b/arch/powerpc/include/asm/keylargo.h index fc195d0b3c3..2156315d8a9 100644 --- a/arch/powerpc/include/asm/keylargo.h +++ b/arch/powerpc/include/asm/keylargo.h @@ -21,7 +21,7 @@ #define KEYLARGO_FCR4 0x48 #define KEYLARGO_FCR5 0x4c /* Pangea only */ -/* K2 aditional FCRs */ +/* K2 additional FCRs */ #define K2_FCR6 0x34 #define K2_FCR7 0x30 #define K2_FCR8 0x2c diff --git a/drivers/hwmon/tmp401.c b/drivers/hwmon/tmp401.c index 8b9a77486d5..951442adc06 100644 --- a/drivers/hwmon/tmp401.c +++ b/drivers/hwmon/tmp401.c @@ -624,7 +624,7 @@ static int tmp401_probe(struct i2c_client *client, goto exit_remove; } - /* Register aditional tmp411 sysfs hooks */ + /* Register additional tmp411 sysfs hooks */ if (data->kind == tmp411) { for (i = 0; i < ARRAY_SIZE(tmp411_attr); i++) { err = device_create_file(&client->dev, diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index faece719086..809c0e6d8e0 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -2008,14 +2008,14 @@ typedef struct { * * When a directory is small enough to fit inside the index root then this * is the only attribute describing the directory. 
When the directory is too - * large to fit in the index root, on the other hand, two aditional attributes + * large to fit in the index root, on the other hand, two additional attributes * are present: an index allocation attribute, containing sub-nodes of the B+ * directory tree (see below), and a bitmap attribute, describing which virtual * cluster numbers (vcns) in the index allocation attribute are in use by an * index block. * * NOTE: The root directory (FILE_root) contains an entry for itself. Other - * dircetories do not contain entries for themselves, though. + * directories do not contain entries for themselves, though. */ typedef struct { ATTR_TYPE type; /* Type of the indexed attribute. Is diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index a7a5733aa4d..51e3ed4527c 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1643,7 +1643,7 @@ static void cxt5051_update_speaker(struct hda_codec *codec) pinctl = (!spec->hp_present && spec->cur_eapd) ? PIN_OUT : 0; snd_hda_codec_write(codec, 0x1a, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, pinctl); - /* on ideapad there is an aditional speaker (subwoofer) to mute */ + /* on ideapad there is an additional speaker (subwoofer) to mute */ if (spec->ideapad) snd_hda_codec_write(codec, 0x1b, 0, AC_VERB_SET_PIN_WIDGET_CONTROL, -- cgit v1.2.3-70-g09d2 From 4ff16c25e2cc48cbe6956e356c38a25ac063a64d Mon Sep 17 00:00:00 2001 From: David Smith Date: Tue, 7 Feb 2012 10:11:05 -0600 Subject: tracepoint, vfs, sched: Add exec() tracepoint Added a minimal exec tracepoint. Exec is an important major event in the life of a task, like fork(), clone() or exit(), all of which we already trace. [ We also do scheduling re-balancing during exec() - so it's useful from a scheduler instrumentation POV as well. ] If you want to watch a task start up, when it gets exec'ed is a good place to start. 
With the addition of this tracepoint, exec's can be monitored and better picture of general system activity can be obtained. This tracepoint will also enable better process life tracking, allowing you to answer questions like "what process keeps starting up binary X?". This tracepoint can also be useful in ftrace filtering and trigger conditions: i.e. starting or stopping filtering when exec is called. Signed-off-by: David Smith Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Al Viro Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F314D19.7030504@redhat.com Signed-off-by: Ingo Molnar --- fs/exec.c | 9 ++++++--- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index aeb135c7ff5..d0d20809277 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -63,6 +63,8 @@ #include #include "internal.h" +#include + int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; @@ -1401,9 +1403,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) */ bprm->recursion_depth = depth; if (retval >= 0) { - if (depth == 0) - ptrace_event(PTRACE_EVENT_EXEC, - old_pid); + if (depth == 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_pid); + } put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6ba596b07a7..e61ddfe8fe9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #include #include +#include /* * Tracepoint for calling kthread_stop, performed to end a kthread: @@ -275,6 +276,32 @@ TRACE_EVENT(sched_process_fork, __entry->child_comm, __entry->child_pid) ); +/* + * Tracepoint for exec: + */ +TRACE_EVENT(sched_process_exec, + + TP_PROTO(struct task_struct *p, pid_t old_pid, + struct linux_binprm *bprm), + 
+ TP_ARGS(p, old_pid, bprm), + + TP_STRUCT__entry( + __string( filename, bprm->filename ) + __field( pid_t, pid ) + __field( pid_t, old_pid ) + ), + + TP_fast_assign( + __assign_str(filename, bprm->filename); + __entry->pid = p->pid; + __entry->old_pid = p->pid; + ), + + TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), + __entry->pid, __entry->old_pid) +); + /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. -- cgit v1.2.3-70-g09d2 From 93518dd2ebafcc761a8637b2877008cfd748c202 Mon Sep 17 00:00:00 2001 From: Masami Ichikawa Date: Tue, 21 Feb 2012 07:43:50 +0900 Subject: sysfs: Fix memory leak in sysfs_sd_setsecdata(). This patch fixes the following two memory leak patterns that were reported by kmemleak. sysfs_sd_setsecdata() is called during the sys_lsetxattr() operation. It checks whether sd->s_iattr is NULL. If it is NULL, it calls sysfs_init_inode_attrs() to allocate memory. The code is this: iattrs = sd->s_iattr; if (!iattrs) iattrs = sysfs_init_inode_attrs(sd); The local iattrs receives sysfs_init_inode_attrs()'s result, but sd->s_iattr is never updated with that address, so sd->s_iattr needs to be set to the allocated address for the memory to be freed in other functions. unreferenced object 0xffff880250b73e60 (size 32): comm "systemd", pid 1, jiffies 4294683888 (age 94.553s) hex dump (first 32 bytes): 73 79 73 74 65 6d 5f 75 3a 6f 62 6a 65 63 74 5f system_u:object_ 72 3a 73 79 73 66 73 5f 74 3a 73 30 00 00 00 00 r:sysfs_t:s0....
backtrace: [] kmemleak_alloc+0x73/0x98 [] __kmalloc+0x100/0x12c [] context_struct_to_string+0x106/0x210 [] security_sid_to_context_core+0x10b/0x129 [] security_sid_to_context+0x10/0x12 [] selinux_inode_getsecurity+0x7d/0xa8 [] selinux_inode_getsecctx+0x22/0x2e [] security_inode_getsecctx+0x16/0x18 [] sysfs_setxattr+0x96/0x117 [] __vfs_setxattr_noperm+0x73/0xd9 [] vfs_setxattr+0x83/0xa1 [] setxattr+0xcf/0x101 [] sys_lsetxattr+0x6a/0x8f [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff unreferenced object 0xffff88024163c5a0 (size 96): comm "systemd", pid 1, jiffies 4294683888 (age 94.553s) hex dump (first 32 bytes): 00 00 00 00 ed 41 00 00 00 00 00 00 00 00 00 00 .....A.......... 00 00 00 00 00 00 00 00 0c 64 42 4f 00 00 00 00 .........dBO.... backtrace: [] kmemleak_alloc+0x73/0x98 [] kmem_cache_alloc_trace+0xc4/0xee [] sysfs_init_inode_attrs+0x2a/0x83 [] sysfs_setxattr+0xbf/0x117 [] __vfs_setxattr_noperm+0x73/0xd9 [] vfs_setxattr+0x83/0xa1 [] setxattr+0xcf/0x101 [] sys_lsetxattr+0x6a/0x8f [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff ` Signed-off-by: Masami Ichikawa Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4291fd1617a..cc7ea5de2fd 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -136,12 +136,13 @@ static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *sec void *old_secdata; size_t old_secdata_len; - iattrs = sd->s_iattr; - if (!iattrs) - iattrs = sysfs_init_inode_attrs(sd); - if (!iattrs) - return -ENOMEM; + if (!sd->s_iattr) { + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + } + iattrs = sd->s_iattr; old_secdata = iattrs->ia_secdata; old_secdata_len = iattrs->ia_secdata_len; -- cgit v1.2.3-70-g09d2 From fe316bf2d5847bc5dd975668671a7b1067603bc7 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Fri, 2 Mar 2012 10:38:33 +0100 
Subject: block: Fix NULL pointer dereference in sd_revalidate_disk Since 2.6.39 (1196f8b), when a driver returns -ENOMEDIUM for open(), __blkdev_get() calls rescan_partitions() to remove in-kernel partition structures and raise KOBJ_CHANGE uevent. However it ends up calling driver's revalidate_disk without open and could cause oops. In the case of SCSI: process A process B ---------------------------------------------- sys_open __blkdev_get sd_open returns -ENOMEDIUM scsi_remove_device rescan_partitions sd_revalidate_disk Oopses are reported here: http://marc.info/?l=linux-scsi&m=132388619710052 This patch separates the partition invalidation from rescan_partitions() and use it for -ENOMEDIUM case. Reported-by: Huajun Li Signed-off-by: Jun'ichi Nomura Acked-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/partition-generic.c | 48 +++++++++++++++++++++++++++++++++++++++-------- fs/block_dev.c | 16 ++++++++++++---- include/linux/genhd.h | 1 + 3 files changed, 53 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/block/partition-generic.c b/block/partition-generic.c index d06ec1c829c..6df5d6928a4 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -389,17 +389,11 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +static int drop_partitions(struct gendisk *disk, struct block_device *bdev) { - struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } + int res; if (bdev->bd_part_count) return -EBUSY; @@ -412,6 +406,24 @@ rescan: delete_partition(disk, part->partno); disk_part_iter_exit(&piter); + return 0; +} + +int rescan_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state = NULL; + struct hd_struct *part; + int p, highest, res; +rescan: + if 
(state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } + + res = drop_partitions(disk, bdev); + if (res) + return res; + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); check_disk_size_change(disk, bdev); @@ -515,6 +527,26 @@ rescan: return 0; } +int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) +{ + int res; + + if (!bdev->bd_invalidated) + return 0; + + res = drop_partitions(disk, bdev); + if (res) + return res; + + set_capacity(disk, 0); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + return 0; +} + unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) { struct address_space *mapping = bdev->bd_inode->i_mapping; diff --git a/fs/block_dev.c b/fs/block_dev.c index 0e575d1304b..5e9f198f771 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1183,8 +1183,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. 
*/ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1214,8 +1218,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe23ee76858..e61d3192448 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -596,6 +596,7 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int disk_expand_part_tbl(struct gendisk *disk, int target); extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); +extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, -- cgit v1.2.3-70-g09d2 From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Feb 2012 17:41:27 +0900 Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice() Pass nice as a value to proc_sched_autogroup_set_nice(). No side effect is expected, and the variable err will be overwritten with the return value. 
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 +-- include/linux/sched.h | 2 +- kernel/sched/auto_group.c | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b0..965d4bde3a3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1310,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = nice; - err = proc_sched_autogroup_set_nice(p, &err); + err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; diff --git a/include/linux/sched.h b/include/linux/sched.h index c628a915143..c298fb9cf5a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2065,7 +2065,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e..0984a21076a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < -20 || nice > 19) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - 
if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. */ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); -- cgit v1.2.3-70-g09d2 From 4b32da2bcf1de2b7a196a0e48389d231b4472c36 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 4 Mar 2012 12:56:55 +0000 Subject: ppp: Replace uses of <linux/if_ppp.h> with <linux/ppp-ioctl.h> Since all that include/linux/if_ppp.h does is #include <linux/ppp-ioctl.h>, this replaces the occurrences of #include <linux/if_ppp.h> with #include <linux/ppp-ioctl.h>. It also corrects an error in Documentation/networking/l2tp.txt, where it referenced include/linux/if_ppp.h as the source of some definitions that are actually now defined in include/linux/if_pppol2tp.h. Signed-off-by: Paul Mackerras Signed-off-by: David S. Miller --- Documentation/networking/l2tp.txt | 2 +- drivers/isdn/capi/capi.c | 2 +- drivers/net/ppp/ppp_async.c | 2 +- drivers/net/ppp/ppp_synctty.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/ppp/pppox.c | 2 +- drivers/tty/ipwireless/network.c | 2 +- drivers/tty/ipwireless/tty.c | 2 +- fs/compat_ioctl.c | 2 +- include/linux/isdn.h | 2 +- net/atm/pppoatm.c | 2 +- net/irda/irnet/irnet.h | 2 +- net/l2tp/l2tp_ppp.c | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt index e7bf3979fac..e63fc1f7bf8 100644 --- a/Documentation/networking/l2tp.txt +++ b/Documentation/networking/l2tp.txt @@ -111,7 +111,7 @@ When creating PPPoL2TP sockets, the application provides information to the driver about the socket in a socket connect() call.
Source and destination tunnel and session ids are provided, as well as the file descriptor of a UDP socket. See struct pppol2tp_addr in -include/linux/if_ppp.h. Note that zero tunnel / session ids are +include/linux/if_pppol2tp.h. Note that zero tunnel / session ids are treated specially. When creating the per-tunnel PPPoL2TP management socket in Step 2 above, zero source and destination session ids are specified, which tells the driver to prepare the supplied UDP file diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index d33a70c4918..0cf05464bfb 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index c6ba6438082..af95a98fd86 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index 736a39ee05b..55e466c511d 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index bc9a4bb3198..2fa1a9b6f49 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 8c0d170dabc..2940e9fe351 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/tty/ipwireless/network.c b/drivers/tty/ipwireless/network.c index f7daeea598e..57c8b481113 100644 --- a/drivers/tty/ipwireless/network.c +++ b/drivers/tty/ipwireless/network.c @@ -22,7 +22,7 @@ #include 
#include #include -#include +#include #include #include "network.h" diff --git a/drivers/tty/ipwireless/tty.c b/drivers/tty/ipwireless/tty.c index ef92869502a..2ffa0b77770 100644 --- a/drivers/tty/ipwireless/tty.c +++ b/drivers/tty/ipwireless/tty.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a26bea10e81..10d8cd90ca6 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/isdn.h b/include/linux/isdn.h index 4ccf95d681b..292f27a793d 100644 --- a/include/linux/isdn.h +++ b/include/linux/isdn.h @@ -187,7 +187,7 @@ typedef struct { #endif #include -#include +#include #include #endif diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index df35d9a3b5f..614d3fc47ed 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index 979ecb2435a..564eb0b8afa 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -254,7 +254,7 @@ #include #include -#include +#include #include #include diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 8a90d756c90..96bc7a67585 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -82,7 +82,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From c097b2ca5140249abc3fb5ae9a545c35125ae8d0 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 5 Mar 2012 15:08:06 -0800 Subject: writeback: fix fn name in writeback_inodes_sb_nr_if_idle() comment header Signed-off-by: Fengguang Wu Signed-off-by: Jiri Kosina --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f855916657b..82e959da686 100644 --- a/fs/fs-writeback.c +++ 
b/fs/fs-writeback.c @@ -1284,7 +1284,7 @@ int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb_if_idle); /** - * writeback_inodes_sb_if_idle - start writeback if none underway + * writeback_inodes_sb_nr_if_idle - start writeback if none underway * @sb: the superblock * @nr: the number of pages to write * @reason: reason why some writeback work was initiated -- cgit v1.2.3-70-g09d2 From 54d20f006ceff1f2f1e69d0e54049b6c0765c039 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Mar 2012 13:03:10 -0800 Subject: Revert "sysfs: Kill nlink counting." This reverts commit 524b6c5b39b931311dfe5a2f5abae2f5c9731676. It has shown to break userspace tools, which is not acceptable. Reported-by: Jiri Slaby Cc: Eric W. Biederman Cc: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 6 ++++++ fs/sysfs/inode.c | 3 +++ fs/sysfs/sysfs.h | 1 + 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index dd3779cf3a3..2a7a3f5d1ca 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -91,6 +91,9 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; struct rb_node *parent = NULL; + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs++; + while (*node) { struct sysfs_dirent *pos; int result; @@ -123,6 +126,9 @@ static int sysfs_link_sibling(struct sysfs_dirent *sd) */ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) { + if (sysfs_type(sd) == SYSFS_DIR) + sd->s_parent->s_dir.subdirs--; + rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index cc7ea5de2fd..feb2d69396c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -217,6 +217,9 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) iattrs->ia_secdata, iattrs->ia_secdata_len); } + + if (sysfs_type(sd) == SYSFS_DIR) + set_nlink(inode, 
sd->s_dir.subdirs + 2); } int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 6289a00287d..661a9639570 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -19,6 +19,7 @@ struct sysfs_open_dirent; struct sysfs_elem_dir { struct kobject *kobj; + unsigned long subdirs; /* children rbtree starts here and goes through sd->s_rb */ struct rb_root children; }; -- cgit v1.2.3-70-g09d2 From 2f2d76cc3e938389feee671b46252dde6880b3b7 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Thu, 8 Mar 2012 05:55:59 +0000 Subject: dlm: Do not allocate a fd for peeloff avoids allocating a fd that a) propagates to every kernel thread and usermodehelper b) is not properly released. References: http://article.gmane.org/gmane.linux.network.drbd/22529 Signed-off-by: Benjamin Poirier Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0b3109ee425..ca0c59a4246 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -474,9 +475,6 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - sctp_peeloff_arg_t parg; - int parglen = sizeof(parg); - int err; /* * We get this before any data for an association. 
@@ -525,23 +523,19 @@ static void process_sctp_notification(struct connection *con, return; /* Peel off a new sock */ - parg.associd = sn->sn_assoc_change.sac_assoc_id; - ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, - SCTP_SOCKOPT_PEELOFF, - (void *)&parg, &parglen); + sctp_lock_sock(con->sock->sk); + ret = sctp_do_peeloff(con->sock->sk, + sn->sn_assoc_change.sac_assoc_id, + &new_con->sock); + sctp_release_sock(con->sock->sk); if (ret < 0) { log_print("Can't peel off a socket for " "connection %d to node %d: err=%d", - parg.associd, nodeid, ret); - return; - } - new_con->sock = sockfd_lookup(parg.sd, &err); - if (!new_con->sock) { - log_print("sockfd_lookup error %d", err); + (int)sn->sn_assoc_change.sac_assoc_id, + nodeid, ret); return; } add_sock(new_con->sock, new_con); - sockfd_put(new_con->sock); log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); -- cgit v1.2.3-70-g09d2 From bfcfaa77bdf0f775263e906015982a608df01c76 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Mar 2012 11:16:17 -0800 Subject: vfs: use 'unsigned long' accesses for dcache name comparison and hashing Ok, this is hacky, and only works on little-endian machines with good unaligned handling. And even then only with CONFIG_DEBUG_PAGEALLOC disabled, since it can access up to 7 bytes after the pathname. But it runs like a bat out of hell. 
Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + fs/Kconfig | 4 ++ fs/dcache.c | 23 +++++++++++ fs/namei.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+) (limited to 'fs') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189f..09675d3e0ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP + select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/fs/Kconfig b/fs/Kconfig index d621f02a3f9..aa195265362 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@ menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + if BLOCK source "fs/ext2/Kconfig" diff --git a/fs/dcache.c b/fs/dcache.c index bcbdb33fcc2..ffd47a16d87 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, static inline int dentry_cmp(const unsigned char *cs, size_t scount, const unsigned char *ct, size_t tcount) { +#ifdef CONFIG_DCACHE_WORD_ACCESS + unsigned long a,b,mask; + + if (unlikely(scount != tcount)) + return 1; + + for (;;) { + a = *(unsigned long *)cs; + b = *(unsigned long *)ct; + if (tcount < sizeof(unsigned long)) + break; + if (unlikely(a != b)) + return 1; + cs += sizeof(unsigned long); + ct += sizeof(unsigned long); + tcount -= sizeof(unsigned long); + if (!tcount) + return 0; + } + mask = ~(~0ul << tcount*8); + return unlikely(!!((a ^ b) & mask)); +#else if (scount != tcount) return 1; @@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, tcount--; } while (tcount); return 0; +#endif } static void __d_free(struct rcu_head *head) diff --git a/fs/namei.c b/fs/namei.c index e2ba62820a0..378497a744b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode 
*inode) return 1; } +/* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + * do a "get_unaligned()" if this helps and is sufficiently + * fast. + * + * - Little-endian machines (so that we can generate the mask + * of low bytes efficiently). Again, we *could* do a byte + * swapping load on big-endian architectures if that is not + * expensive enough to make the optimization worthless. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + * do not trap on the (extremely unlikely) case of a page + * crossing operation. + * + * - Furthermore, we need an efficient 64-bit compile for the + * 64-bit case in order to generate the "number of bytes in + * the final mask". Again, that could be replaced with a + * efficient population count instruction or similar. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. 
+ */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608 >> 56; +} + +static inline unsigned int fold_hash(unsigned long hash) +{ + hash += hash >> (8*sizeof(int)); + return hash; +} + +#else /* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; +} + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long a, mask; + unsigned long hash = 0; + + for (;;) { + a = *(unsigned long *)name; + hash *= 9; + if (len < sizeof(unsigned long)) + break; + hash += a; + name += sizeof(unsigned long); + len -= sizeof(unsigned long); + if (!len) + goto done; + } + mask = ~(~0ul << len*8); + hash += mask & a; +done: + return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +#define ONEBYTES 0x0101010101010101ul +#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful +#define HIGHBITS 0x8080808080808080ul + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - ONEBYTES) & ~a) & HIGHBITS; +} + +/* + * Calculate the length and hash of the path component, and + * return the length of the component; + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long a, mask, hash, len; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = (hash + a) * 9; + len += sizeof(unsigned long); + a = *(unsigned long *)(name+len); + /* Do we have any NUL or '/' bytes in this word? 
*/ + mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + } while (!mask); + + /* The mask *below* the first high bit set */ + mask = (mask - 1) & ~mask; + mask >>= 7; + hash += a & mask; + *hashp = fold_hash(hash); + + return len + count_masked_bytes(mask); +} + +#else + unsigned int full_name_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); @@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) return len; } +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into -- cgit v1.2.3-70-g09d2 From 2c724fb92732c0b2a5629eb8af74e82eb62ac947 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 16 Mar 2012 10:28:07 +0000 Subject: afs: Read of file returns EBADMSG A read of a large file on an afs mount failed: # cat junk.file > /dev/null cat: junk.file: Bad message Looking at the trace, call->offset wrapped since it is only an unsigned short. In afs_extract_data: _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); ... if (call->offset < count) { if (last) { _leave(" = -EBADMSG [%d < %zu]", call->offset, count); return -EBADMSG; } Which matches the trace: [cat ] ==> afs_extract_data({65132},{524},1,,65536) [cat ] <== afs_extract_data() = -EBADMSG [0 < 65536] call->offset went from 65132 to 0. Fix this by making call->offset an unsigned int. 
Signed-off-by: Anton Blanchard Signed-off-by: David Howells Cc: Signed-off-by: Linus Torvalds --- fs/afs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d2b0888126d..a306bb6d88d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -109,7 +109,7 @@ struct afs_call { unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned short offset; /* offset into received data store */ + unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ -- cgit v1.2.3-70-g09d2 From c0173863528a8c9212c53e080d63a1aaae5ef4f4 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 16 Mar 2012 10:28:19 +0000 Subject: afs: Remote abort can cause BUG in rxrpc code When writing files to afs I sometimes hit a BUG: kernel BUG at fs/afs/rxrpc.c:179! With a backtrace of: afs_free_call afs_make_call afs_fs_store_data afs_vnode_store_data afs_write_back_from_locked_page afs_writepages_region afs_writepages The cause is: ASSERT(skb_queue_empty(&call->rx_queue)); Looking at a tcpdump of the session the abort happens because we are exceeding our disk quota: rx abort fs reply store-data error diskquota exceeded (32) So the abort error is valid. We hit the BUG because we haven't freed all the resources for the call. By freeing any skbs in call->rx_queue before calling afs_free_call we avoid leaking memory and avoid hitting the BUG. 
Signed-off-by: Anton Blanchard Signed-off-by: David Howells Cc: Signed-off-by: Linus Torvalds --- fs/afs/rxrpc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e45a323aebb..8ad8c2a0703 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; + struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); rxrpc_kernel_end_call(rxcall); call->rxcall = NULL; error_kill_call: -- cgit v1.2.3-70-g09d2 From 3d777a64066f3b9db8a94834aaed6a9cf09808fd Mon Sep 17 00:00:00 2001 From: Haogang Chen Date: Fri, 16 Mar 2012 17:08:38 -0700 Subject: nilfs2: clamp ns_r_segments_percentage to [1, 99] ns_r_segments_percentage is read from the disk. Bogus or malicious value could cause integer overflow and malfunction due to meaningless disk usage calculation. This patch reports error when mounting such bogus volumes. 
Signed-off-by: Haogang Chen Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d3271409437..8a759016c2e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -409,6 +409,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); + if (nilfs->ns_r_segments_percentage < 1 || + nilfs->ns_r_segments_percentage > 99) { + printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); + return -EINVAL; + } + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; -- cgit v1.2.3-70-g09d2 From d7178c79d9b7c5518f9943188091a75fc6ce0675 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 16 Mar 2012 17:08:39 -0700 Subject: nilfs2: fix NULL pointer dereference in nilfs_load_super_block() According to the report from Slicky Devil, nilfs caused kernel oops at nilfs_load_super_block function during mount after he shrank the partition without resizing the filesystem: BUG: unable to handle kernel NULL pointer dereference at 00000048 IP: [] nilfs_load_super_block+0x17e/0x280 [nilfs2] *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP ... 
Call Trace: [] init_nilfs+0x4b/0x2e0 [nilfs2] [] nilfs_mount+0x447/0x5b0 [nilfs2] [] mount_fs+0x36/0x180 [] vfs_kern_mount+0x51/0xa0 [] do_kern_mount+0x3e/0xe0 [] do_mount+0x169/0x700 [] sys_mount+0x6b/0xa0 [] sysenter_do_call+0x12/0x28 Code: 53 18 8b 43 20 89 4b 18 8b 4b 24 89 53 1c 89 43 24 89 4b 20 8b 43 20 c7 43 2c 00 00 00 00 23 75 e8 8b 50 68 89 53 28 8b 54 b3 20 <8b> 72 48 8b 7a 4c 8b 55 08 89 b3 84 00 00 00 89 bb 88 00 00 00 EIP: [] nilfs_load_super_block+0x17e/0x280 [nilfs2] SS:ESP 0068:ca9bbdcc CR2: 0000000000000048 This turned out to be due to a defect in an error path which runs if the calculated location of the secondary super block was invalid. This patch fixes it and eliminates the reported oops. Reported-by: Slicky Devil Signed-off-by: Ryusuke Konishi Tested-by: Slicky Devil Cc: [2.6.30+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8a759016c2e..501b7f8b739 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -521,6 +521,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, brelse(sbh[1]); sbh[1] = NULL; sbp[1] = NULL; + valid[1] = 0; swp = 0; } if (!valid[swp]) { -- cgit v1.2.3-70-g09d2 From 93dc6107a76daed81c07f50215fa6ae77691634f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 16 Mar 2012 16:34:03 -0400 Subject: Don't limit non-nested epoll paths Commit 28d82dc1c4ed ("epoll: limit paths") that I did to limit the number of possible wakeup paths in epoll is causing a few applications to no longer work (dovecot for one). The original patch is really about limiting the amount of epoll nesting (since epoll fds can be attached to other fds). Thus, we probably can allow an unlimited number of paths of depth 1. My current patch limits it at 1000. And enforce the limits on paths that have a greater depth. 
This is captured in: https://bugzilla.redhat.com/show_bug.cgi?id=681578 Signed-off-by: Jason Baron Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ea54cdef04d..4d9d3a45e35 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -988,6 +988,10 @@ static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + if (++path_count[nests] > path_limits[nests]) return -1; return 0; -- cgit v1.2.3-70-g09d2 From 6d7d1a0dc735ea8412769edae7154885021107a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Mar 2012 16:19:53 -0700 Subject: vfs: get rid of batshit-insane pointless dentry hash calculations For some odd historical reason, the final mixing round for the dentry cache hash table lookup had an insane "xor with big constant" logic. In two places. The big constant that is being xor'ed is GOLDEN_RATIO_PRIME, which is a fairly random-looking number that is designed to be *multiplied* with so that the bits get spread out over a whole long-word. But xor'ing with it is insane. It doesn't really even change the hash - it really only shifts the hash around in the hash table. To make matters worse, the insane big constant is different on 32-bit and 64-bit builds, even though the name hash bits we use are always 32-bit (and the bits from the pointer we mix in effectively are too). It's all total voodoo programming, in other words. Now, some testing and analysis of the hash chains shows that the rest of the hash function seems to be fairly good. It does pick the right bits of the parent dentry pointer, for example, and while it's generally a bad idea to use an xor to mix down the upper bits (because if there is a repeating pattern, the xor can cause "destructive interference"), it seems to not have been a disaster. 
For example, replacing the hash with the normal "hash_long()" code (that uses the GOLDEN_RATIO_PRIME constant correctly, btw) actually just makes the hash worse. The hand-picked hash knew which bits of the pointer had the highest entropy, and hash_long() ends up mixing bits less optimally at least in some trivial tests. So the hash function overall seems fine, it just has that really odd "shift result around by a constant xor". So get rid of the silly xor, and replace the down-mixing of the bits with an add instead of an xor that tends to not have the same kind of destructive interference issues. Some stats on the resulting hash chains shows that they look statistically identical before and after, but the code is simpler and no longer makes you go "WTF?". Also, the incoming hash really is just "unsigned int", not a long, and there's no real point to worry about the high 26 bits of the dentry pointer for the 64-bit case, because they are all going to be identical anyway. So also change the hashing to be done in the more natural 'unsigned int' that is the real size of the actual hashed data anyway. 
Signed-off-by: Linus Torvalds --- fs/dcache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index bcbdb33fcc2..5f00a6f63c9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -105,10 +105,10 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; static inline struct hlist_bl_head *d_hash(const struct dentry *parent, - unsigned long hash) + unsigned int hash) { - hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; - hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); + hash += (unsigned long) parent / L1_CACHE_BYTES; + hash = hash + (hash >> D_HASHBITS); return dentry_hashtable + (hash & D_HASHMASK); } -- cgit v1.2.3-70-g09d2 From f1f996b66cc3908a8f5ffccc2ff41840e92f3b10 Mon Sep 17 00:00:00 2001 From: Laura Vasilescu Date: Mon, 19 Mar 2012 15:41:15 +0200 Subject: kcore: fix spelling in read_kcore() comment Signed-off-by: Laura Vasilescu Signed-off-by: Jiri Kosina --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d245cb23dd7..e5e69aff6c6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -513,7 +513,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) n = copy_to_user(buffer, (char *)start, tsz); /* - * We cannot distingush between fault on source + * We cannot distinguish between fault on source * and fault on destination. When this happens * we clear too and hope it will trigger the * EFAULT again. -- cgit v1.2.3-70-g09d2 From ad2a8e6078a16d3b61b530f1447110841c36ae56 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 20 Mar 2012 16:58:06 +0000 Subject: AFS: checking wrong bit in afs_readpages() We should be testing "if (vnode->flags & (1 << 4))" instead of "if (vnode->flags & 4) {". The current test checks if the data was modified instead of deleted. 
Signed-off-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 14d89fa58fe..8f6e9234d56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -251,7 +251,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); - if (vnode->flags & AFS_VNODE_DELETED) { + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { _leave(" = -ESTALE"); return -ESTALE; } -- cgit v1.2.3-70-g09d2 From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:22 +0100 Subject: exit_signal: simplify the "we have changed execution domain" logic exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id" to handle the "we have changed execution domain" case. We can change de_thread() to always set ->exit_signal = SIGCHLD and remove this check to simplify the code. We could change setup_new_exec() instead, this looks more logical because it increments ->self_exec_id. But note that de_thread() already resets ->exit_signal if it changes the leader, let's keep both changes close to each other. Note that we change ->exit_signal lockless, this changes the rules. Thereafter ->exit_signal is not stable under tasklist but this is fine, the only possible change is OLDSIG -> SIGCHLD. This can race with eligible_child() but the race is harmless. We can race with reparent_leader() which changes our ->exit_signal in parallel, but it does the same change to SIGCHLD. The noticeable user-visible change is that the execing task is not "visible" to do_wait()->eligible_child(__WCLONE) right after exec. To me this looks more logical, and this is consistent with mt case. 
Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ kernel/exit.c | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b0695a9900e..1e94d2263ae 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -977,6 +977,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + /* we have changed execution domain */ + tsk->exit_signal = SIGCHLD; + if (current->mm) setmax_mm_hiwater_rss(&sig->maxrss, current->mm); diff --git a/kernel/exit.c b/kernel/exit.c index 752d2c0abd1..51ac4ced131 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * If the parent exec id doesn't match the exec id we saved * when we started then we know the parent has changed security * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. */ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) + tsk->parent_exec_id != tsk->real_parent->self_exec_id) tsk->exit_signal = SIGCHLD; if (unlikely(tsk->ptrace)) { -- cgit v1.2.3-70-g09d2 From 701085b219016d38f105b031381b9cee6200253a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:04:01 +0100 Subject: exec: move de_thread()->setmax_mm_hiwater_rss() into exec_mmap() Minor cleanup. de_thread()->setmax_mm_hiwater_rss() looks a bit strange, move it into exec_mmap() which plays with old_mm. 
Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1e94d2263ae..95551c6da09 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -850,6 +850,7 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; @@ -980,9 +981,6 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; - if (current->mm) - setmax_mm_hiwater_rss(&sig->maxrss, current->mm); - exit_itimers(sig); flush_itimer_signals(); -- cgit v1.2.3-70-g09d2