From 02125a826459a6ad142f8d91c5b6357562f96615 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Dec 2011 08:43:34 -0500 Subject: fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API __d_path() API is asking for trouble and in case of apparmor d_namespace_path() getting just that. The root cause is that when __d_path() misses the root it had been told to look for, it stores the location of the most remote ancestor in *root. Without grabbing references. Sure, at the moment of call it had been pinned down by what we have in *path. And if we raced with umount -l, we could have very well stopped at vfsmount/dentry that got freed as soon as prepend_path() dropped vfsmount_lock. It is safe to compare these pointers with pre-existing (and known to be still alive) vfsmount and dentry, as long as all we are asking is "is it the same address?". Dereferencing is not safe and apparmor ended up stepping into that. d_namespace_path() really wants to examine the place where we stopped, even if it's not connected to our namespace. As the result, it looked at ->d_sb->s_magic of a dentry that might've been already freed by that point. All other callers had been careful enough to avoid that, but it's really a bad interface - it invites that kind of trouble. The fix is fairly straightforward, even though it's bigger than I'd like: * prepend_path() root argument becomes const. * __d_path() is never called with NULL/NULL root. It was a kludge to start with. Instead, we have an explicit function - d_absolute_path(). Same as __d_path(), except that it doesn't get root passed and stops where it stops. apparmor and tomoyo are using it. * __d_path() returns NULL on path outside of root. The main caller is show_mountinfo() and that's precisely what we pass root for - to skip those outside chroot jail. Those who don't want that can (and do) use d_path(). * __d_path() root argument becomes const. Everyone agrees, I hope. 
* apparmor does *NOT* try to use __d_path() or any of its variants when it sees that path->mnt is an internal vfsmount. In that case it's definitely not mounted anywhere and dentry_path() is exactly what we want there. Handling of sysctl()-triggered weirdness is moved to that place. * if apparmor is asked to do pathname relative to chroot jail and __d_path() tells it it's not in that jail, the sucker just calls d_absolute_path() instead. That's the other remaining caller of __d_path(), BTW. * seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway - the normal seq_file logics will take care of growing the buffer and redoing the call of ->show() just fine). However, if it gets path not reachable from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped ignoring the return value as it used to do). Reviewed-by: John Johansen ACKed-by: John Johansen Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- fs/dcache.c | 71 ++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 27 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 10ba92def3f..89509b5a090 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2439,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). 
*/ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; @@ -2483,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2500,15 +2498,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2519,10 +2519,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. 
*/ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2533,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2541,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. */ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2579,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2594,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2617,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && 
path->dentry->d_op->d_dname) @@ -2625,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2758,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; -- cgit v1.2.3-18-g5258 From b2dba1af3c4157040303a76d25216b1713d333d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Nov 2011 19:26:23 -0500 Subject: vfs: new internal helper: mnt_has_parent(mnt) vfsmounts have ->mnt_parent pointing either to a different vfsmount or to itself; it's never NULL and termination condition in loops traversing the tree towards root is mnt == mnt->mnt_parent. At least one place (see the next patch) is confused about what's going on; let's add an explicit helper checking it right way and use it in all places where we need it. Not that there had been too many, but... 
Signed-off-by: Al Viro --- fs/dcache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 89509b5a090..8a75e3b0f49 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -38,6 +38,7 @@ #include #include #include "internal.h" +#include "mount.h" /* * Usage: @@ -2460,9 +2461,8 @@ static int prepend_path(const struct path *path, if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - if (vfsmnt->mnt_parent == vfsmnt) { + if (!mnt_has_parent(vfsmnt)) goto global_root; - } dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; continue; @@ -2862,7 +2862,7 @@ int path_is_under(struct path *path1, struct path *path2) br_read_lock(vfsmount_lock); if (mnt != path2->mnt) { for (;;) { - if (mnt->mnt_parent == mnt) { + if (!mnt_has_parent(mnt)) { br_read_unlock(vfsmount_lock); return 0; } -- cgit v1.2.3-18-g5258 From afac7cba7ed31968a95e181dc25e204e45009ea8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Nov 2011 19:34:49 -0500 Subject: vfs: more mnt_parent cleanups a) mount --move is checking that ->mnt_parent is non-NULL before looking if that parent happens to be shared; ->mnt_parent is never NULL and it's not even a misspelled !mnt_has_parent() b) pivot_root open-codes is_path_reachable(), poorly. c) so does path_is_under(), while we are at it. 
Signed-off-by: Al Viro --- fs/dcache.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 8a75e3b0f49..64c8ce4c147 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2853,31 +2853,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -int path_is_under(struct path *path1, struct path *path2) -{ - struct vfsmount *mnt = path1->mnt; - struct dentry *dentry = path1->dentry; - int res; - - br_read_lock(vfsmount_lock); - if (mnt != path2->mnt) { - for (;;) { - if (!mnt_has_parent(mnt)) { - br_read_unlock(vfsmount_lock); - return 0; - } - if (mnt->mnt_parent == path2->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - res = is_subdir(dentry, path2->dentry); - br_read_unlock(vfsmount_lock); - return res; -} -EXPORT_SYMBOL(path_is_under); - void d_genocide(struct dentry *root) { struct dentry *this_parent; -- cgit v1.2.3-18-g5258 From 676da58df740f325034b8641311413c2393588e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 21:47:05 -0500 Subject: vfs: spread struct mount - mnt_has_parent Signed-off-by: Al Viro --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 64c8ce4c147..1834e715f81 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2460,8 +2460,9 @@ static int prepend_path(const struct path *path, struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { + struct mount *mnt = real_mount(vfsmnt); /* Global root? */ - if (!mnt_has_parent(vfsmnt)) + if (!mnt_has_parent(mnt)) goto global_root; dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; -- cgit v1.2.3-18-g5258 From 3376f34fff5be9954fd9a9c4fd68f4a0a36d480e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:05:19 -0500 Subject: vfs: mnt_parent moved to struct mount the second victim... 
Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 1834e715f81..eef2d5472f9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2465,7 +2465,7 @@ static int prepend_path(const struct path *path, if (!mnt_has_parent(mnt)) goto global_root; dentry = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; + vfsmnt = mnt->mnt_parent; continue; } parent = dentry->d_parent; -- cgit v1.2.3-18-g5258 From 0714a533805a0f8ebfc6fdb6bda9f129b8c7c6d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:19:58 -0500 Subject: vfs: now it can be done - make mnt_parent point to struct mount Signed-off-by: Al Viro --- fs/dcache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index eef2d5472f9..98b48753f77 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2452,6 +2452,7 @@ static int prepend_path(const struct path *path, { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; + struct mount *mnt = real_mount(vfsmnt); bool slash = false; int error = 0; @@ -2460,12 +2461,12 @@ static int prepend_path(const struct path *path, struct dentry * parent; if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *mnt = real_mount(vfsmnt); /* Global root? 
*/ if (!mnt_has_parent(mnt)) goto global_root; - dentry = vfsmnt->mnt_mountpoint; - vfsmnt = mnt->mnt_parent; + dentry = mnt->mnt.mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; continue; } parent = dentry->d_parent; -- cgit v1.2.3-18-g5258 From a73324da7af4052e1d1ddec6a5980f552420e58b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Nov 2011 22:25:07 -0500 Subject: vfs: move mnt_mountpoint to struct mount Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 98b48753f77..24790041ea7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2464,7 +2464,7 @@ static int prepend_path(const struct path *path, /* Global root? */ if (!mnt_has_parent(mnt)) goto global_root; - dentry = mnt->mnt.mnt_mountpoint; + dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; vfsmnt = &mnt->mnt; continue; -- cgit v1.2.3-18-g5258 From 143c8c91cee7efdd732ec5f61b3471fc46192f20 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Nov 2011 00:46:35 -0500 Subject: vfs: mnt_ns moved to struct mount Signed-off-by: Al Viro --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 24790041ea7..9791b1e7eee 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2503,7 +2503,7 @@ global_root: if (!slash) error = prepend(buffer, buflen, "/", 1); if (!error) - error = vfsmnt->mnt_ns ? 1 : 2; + error = real_mount(vfsmnt)->mnt_ns ? 1 : 2; goto out; } -- cgit v1.2.3-18-g5258 From b48f03b319ba78f3abf9a7044d1f436d8d90f4f9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 23 Aug 2011 18:56:24 +1000 Subject: dcache: use a dispose list in select_parent select_parent currently abuses the dentry cache LRU to provide cleanup features for child dentries that need to be freed. 
It moves them to the tail of the LRU, then tells shrink_dcache_parent() to call __shrink_dcache_sb to unconditionally move them to a dispose list (as DCACHE_REFERENCED is ignored). __shrink_dcache_sb() has to relock the dentries to move them off the LRU onto the dispose list, but otherwise does not touch the dentries that select_parent() moved to the tail of the LRU. It then passes the dispose list to shrink_dentry_list() which tries to free the dentries. IOWs, the use of __shrink_dcache_sb() is superfluous - we can build exactly the same list of dentries for disposal directly in select_parent() and call shrink_dentry_list() instead of calling __shrink_dcache_sb() to do that. This means that we avoid long holds on the lru lock walking the LRU moving dentries to the dispose list. We also avoid the need to relock each dentry just to move it off the LRU, reducing the number of times we lock each dentry to dispose of them in shrink_dcache_parent() from 3 to 2 times. Further, we remove one of the two callers of __shrink_dcache_sb(). This also means that __shrink_dcache_sb can be moved back into prune_dcache_sb() and we no longer have to handle referenced dentries conditionally, simplifying the code. 
Signed-off-by: Dave Chinner Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/dcache.c | 63 +++++++++++++++++++++---------------------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 9791b1e7eee..b209d73f9a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -276,15 +276,15 @@ static void dentry_lru_prune(struct dentry *dentry) } } -static void dentry_lru_move_tail(struct dentry *dentry) +static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_add_tail(&dentry->d_lru, list); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; } else { - list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_move_tail(&dentry->d_lru, list); } spin_unlock(&dcache_lru_lock); } @@ -770,14 +770,18 @@ static void shrink_dentry_list(struct list_head *list) } /** - * __shrink_dcache_sb - shrink the dentry LRU on a given superblock - * @sb: superblock to shrink dentry LRU. - * @count: number of entries to prune - * @flags: flags to control the dentry processing + * prune_dcache_sb - shrink the dcache + * @sb: superblock + * @count: number of entries to try to free + * + * Attempt to shrink the superblock dcache LRU by @count entries. This is + * done when we need more memory an called from the superblock shrinker + * function. * - * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + * This function may fail to free any resources if all the dentries are in + * use. 
*/ -static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) +void prune_dcache_sb(struct super_block *sb, int count) { struct dentry *dentry; LIST_HEAD(referenced); @@ -796,13 +800,7 @@ relock: goto relock; } - /* - * If we are honouring the DCACHE_REFERENCED flag and the - * dentry has this flag set, don't free it. Clear the flag - * and put it back on the LRU. - */ - if (flags & DCACHE_REFERENCED && - dentry->d_flags & DCACHE_REFERENCED) { + if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); @@ -821,23 +819,6 @@ relock: shrink_dentry_list(&tmp); } -/** - * prune_dcache_sb - shrink the dcache - * @sb: superblock - * @nr_to_scan: number of entries to try to free - * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker - * function. - * - * This function may fail to free any resources if all the dentries are in - * use. - */ -void prune_dcache_sb(struct super_block *sb, int nr_to_scan) -{ - __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); -} - /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -1092,7 +1073,7 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, struct list_head *dispose) { struct dentry *this_parent; struct list_head *next; @@ -1114,12 +1095,11 @@ resume: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache + /* + * move only zero ref count dentries to the dispose list. 
*/ if (!dentry->d_count) { - dentry_lru_move_tail(dentry); + dentry_lru_move_list(dentry, dispose); found++; } else { dentry_lru_del(dentry); @@ -1181,14 +1161,13 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ - void shrink_dcache_parent(struct dentry * parent) { - struct super_block *sb = parent->d_sb; + LIST_HEAD(dispose); int found; - while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, found, 0); + while ((found = select_parent(parent, &dispose)) != 0) + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(shrink_dcache_parent); -- cgit v1.2.3-18-g5258 From adc0e91ab142abe93f5b0d7980ada8a7676231fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 16:49:21 -0500 Subject: vfs: new helper - d_make_root() d_alloc_root() with iput() in case of allocation failure... Signed-off-by: Al Viro --- fs/dcache.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index b209d73f9a9..3c6d3113a25 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1440,6 +1440,23 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +struct dentry *d_make_root(struct inode *root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = __d_alloc(root_inode->i_sb, &name); + if (res) + d_instantiate(res, root_inode); + else + iput(root_inode); + } + return res; +} +EXPORT_SYMBOL(d_make_root); + static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; -- cgit v1.2.3-18-g5258 From eaf5f9073533cde21c7121c136f1c3f072d9cf59 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Jan 2012 18:22:25 +0100 Subject: fix shrink_dcache_parent() livelock Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may cause shrink_dcache_parent() to loop forever. 
Here's what appears to happen: 1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1 2 - CPU1: select_parent(P) locks P->d_lock 3 - CPU0: shrink_dentry_list() locks C->d_lock dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock 4 - CPU1: select_parent(P) locks C->d_lock, moves C from dispose list being processed on CPU0 to the new dispose list, returns 1 5 - CPU0: shrink_dentry_list() finds dispose list empty, returns 6 - Goto 2 with CPU0 and CPU1 switched Basically select_parent() steals the dentry from shrink_dentry_list() and thinks it found a new one, causing shrink_dcache_parent() to think it's making progress and loop over and over. One way to trigger this is to make udev call stat() on the sysfs file while it is going away. Having a file in /lib/udev/rules.d/ with only this one rule seems to do the trick: ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true" Then execute the following loop: while true; do echo -bond0 > /sys/class/net/bonding_masters echo +bond0 > /sys/class/net/bonding_masters echo -bond1 > /sys/class/net/bonding_masters echo +bond1 > /sys/class/net/bonding_masters done One fix would be to check all callers and prevent concurrent calls to shrink_dcache_parent(). But I think a better solution is to stop the stealing behavior. This patch adds a new dentry flag that is set when the dentry is added to the dispose list. The flag is cleared in dentry_lru_del() in case the dentry gets a new reference just before being pruned. If the dentry has this flag, select_parent() will skip it and let shrink_dentry_list() retry pruning it. With select_parent() skipping those dentries there will not be the appearance of progress (new dentries found) when there is none, hence shrink_dcache_parent() will not loop forever. The flag is also set in prune_dcache_sb() for consistency as suggested by Linus. 
Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 3c6d3113a25..616fedff011 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; } @@ -806,6 +807,7 @@ relock: spin_unlock(&dentry->d_lock); } else { list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; spin_unlock(&dentry->d_lock); if (!--count) break; @@ -1097,14 +1099,19 @@ resume: /* * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. */ - if (!dentry->d_count) { + if (dentry->d_count) { + dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); + dentry->d_flags |= DCACHE_SHRINK_LIST; found++; - } else { - dentry_lru_del(dentry); } - /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find -- cgit v1.2.3-18-g5258 From 46f72b349290d2bd7aecea38f02609d814332df6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 10 Jan 2012 09:04:37 -0800 Subject: vfs: export symbol d_find_any_alias() Ceph needs this. 
Reviewed-by: Christoph Hellwig Signed-off-by: Sage Weil --- fs/dcache.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 89509b5a090..ba960051dfb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1471,7 +1471,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } -static struct dentry * d_find_any_alias(struct inode *inode) +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; @@ -1480,7 +1487,7 @@ static struct dentry * d_find_any_alias(struct inode *inode) spin_unlock(&inode->i_lock); return de; } - +EXPORT_SYMBOL(d_find_any_alias); /** * d_obtain_alias - find or allocate a dentry for a given inode -- cgit v1.2.3-18-g5258 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. 
Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc0..fe19ac13f75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } -- cgit v1.2.3-18-g5258 From 8966be90304b394fd6a2c5af7b6b3abe2df3889c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:23:30 -0800 Subject: vfs: trivial __d_lookup_rcu() cleanups These don't change any semantics, but they clean up the code a bit and mark some arguments appropriately 'const'. They came up as I was doing the word-at-a-time dcache name accessor code, and cleaning this up now allows me to send out a smaller relevant interesting patch for the experimental stuff. 
Signed-off-by: Linus Torvalds --- fs/dcache.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs/dcache.c') diff --git a/fs/dcache.c b/fs/dcache.c index fe19ac13f75..138be96e25b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -104,7 +104,7 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(struct dentry *parent, +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned long hash) { hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; @@ -1717,8 +1717,9 @@ EXPORT_SYMBOL(d_add_ci); * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. */ -struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, - unsigned *seq, struct inode **inode) +struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, + unsigned *seqp, struct inode **inode) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -1748,6 +1749,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + unsigned seq; struct inode *i; const char *tname; int tlen; @@ -1756,7 +1758,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, continue; seqretry: - *seq = read_seqcount_begin(&dentry->d_seq); + seq = read_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) @@ -1771,7 +1773,7 @@ seqretry: * edge of memory when walking. If we could load this * atomically some other way, we could drop this check. 
*/ - if (read_seqcount_retry(&dentry->d_seq, *seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto seqretry; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, @@ -1788,6 +1790,7 @@ seqretry: * order to do anything useful with the returned dentry * anyway. */ + *seqp = seq; *inode = i; return dentry; } -- cgit v1.2.3-18-g5258