diff options
Diffstat (limited to 'fs')
210 files changed, 5257 insertions, 3243 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 02a2cf61631..51545529637 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -21,8 +21,8 @@ #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" -#include "v9fs_vfs.h" #include "v9fs.h" +#include "v9fs_vfs.h" static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) { @@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); return 0; @@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dacl); - posix_acl_release(pacl); } else retval = -EIO; + if (!IS_ERR(dacl)) + posix_acl_release(dacl); + + if (!IS_ERR(pacl)) + posix_acl_release(pacl); + return retval; } @@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) return -ECHILD; v9ses = v9fs_inode2v9ses(inode); - if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { + if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || + ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { /* - * On access = client mode get the acl + * On access = client and acl = on mode get the acl * values from the server */ return 0; @@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl) struct inode *inode = dentry->d_inode; set_cached_acl(inode, type, acl); + + if (!acl) + return 0; + /* Set a setxattr request to server */ size = posix_acl_xattr_size(acl->a_count); buffer = kmalloc(size, GFP_KERNEL); @@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry) int v9fs_set_create_acl(struct dentry *dentry, struct posix_acl *dpacl, struct posix_acl *pacl) { - if (dpacl) - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - if (pacl) - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); posix_acl_release(dpacl); posix_acl_release(pacl); return 0; diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 0dbe0d139ac..5b335c5086a 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -33,67 +33,11 @@ #define CACHETAG_LEN 11 -struct kmem_cache *vcookie_cache; - struct fscache_netfs v9fs_cache_netfs = { .name = "9p", .version = 0, }; -static void init_once(void *foo) -{ - struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo; - vcookie->fscache = NULL; - vcookie->qid = NULL; - inode_init_once(&vcookie->inode); -} - -/** - * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain - * vcookie to inode mapping - * - * Returns 0 on success. - */ - -static int v9fs_init_vcookiecache(void) -{ - vcookie_cache = kmem_cache_create("vcookie_cache", - sizeof(struct v9fs_cookie), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (!vcookie_cache) - return -ENOMEM; - - return 0; -} - -/** - * v9fs_destroy_vcookiecache - destroy the cache of vcookies - * - */ - -static void v9fs_destroy_vcookiecache(void) -{ - kmem_cache_destroy(vcookie_cache); -} - -int __v9fs_cache_register(void) -{ - int ret; - ret = v9fs_init_vcookiecache(); - if (ret < 0) - return ret; - - return fscache_register_netfs(&v9fs_cache_netfs); -} - -void __v9fs_cache_unregister(void) -{ - v9fs_destroy_vcookiecache(); - fscache_unregister_netfs(&v9fs_cache_netfs); -} - /** * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. @@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, } const struct fscache_cookie_def v9fs_cache_session_index_def = { - .name = "9P.session", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = v9fs_cache_session_get_key, + .name = "9P.session", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = v9fs_cache_session_get_key, }; void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) @@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); - - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, - vcookie->qid->path); - return sizeof(vcookie->qid->path); + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->fscache_key->path, + sizeof(v9inode->fscache_key->path)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, + v9inode->fscache_key->path); + return sizeof(v9inode->fscache_key->path); } static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - *size = i_size_read(&vcookie->inode); + const struct v9fs_inode *v9inode = cookie_netfs_data; + *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t buflen) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; - memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); - - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, - vcookie->qid->version); - return sizeof(vcookie->qid->version); + const struct v9fs_inode *v9inode = cookie_netfs_data; + memcpy(buffer, &v9inode->fscache_key->version, + sizeof(v9inode->fscache_key->version)); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, + v9inode->fscache_key->version); + return sizeof(v9inode->fscache_key->version); } static enum @@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen) { - const struct v9fs_cookie *vcookie = cookie_netfs_data; + const struct v9fs_inode *v9inode = cookie_netfs_data; - if (buflen != sizeof(vcookie->qid->version)) + if (buflen != sizeof(v9inode->fscache_key->version)) return FSCACHE_CHECKAUX_OBSOLETE; - if (memcmp(buffer, &vcookie->qid->version, - sizeof(vcookie->qid->version))) + if (memcmp(buffer, &v9inode->fscache_key->version, + sizeof(v9inode->fscache_key->version))) return FSCACHE_CHECKAUX_OBSOLETE; return FSCACHE_CHECKAUX_OKAY; @@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) { - struct v9fs_cookie *vcookie = cookie_netfs_data; + struct v9fs_inode *v9inode = cookie_netfs_data; struct pagevec pvec; pgoff_t first; int loop, nr_pages; @@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) first = 0; for (;;) { - nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, + nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, first, PAGEVEC_SIZE - pagevec_count(&pvec)); if (!nr_pages) @@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { void v9fs_cache_inode_get_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie; + struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; if (!S_ISREG(inode->i_mode)) return; - vcookie = v9fs_inode2cookie(inode); - if (vcookie->fscache) + v9inode = V9FS_I(inode); + if (v9inode->fscache) return; v9ses = v9fs_inode2v9ses(inode); - vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - vcookie); + v9inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - vcookie->fscache); + v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - if (!vcookie->fscache) + if (!v9inode->fscache) return; P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - vcookie->fscache); + v9inode->fscache); - fscache_relinquish_cookie(vcookie->fscache, 0); - vcookie->fscache = NULL; + fscache_relinquish_cookie(v9inode->fscache, 0); + v9inode->fscache = NULL; } void v9fs_cache_inode_flush_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - if (!vcookie->fscache) + if (!v9inode->fscache) return; P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - vcookie->fscache); + v9inode->fscache); - fscache_relinquish_cookie(vcookie->fscache, 1); - vcookie->fscache = NULL; + fscache_relinquish_cookie(v9inode->fscache, 1); + v9inode->fscache = NULL; } void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; - if (!vcookie->fscache) + if (!v9inode->fscache) return; - spin_lock(&vcookie->lock); + spin_lock(&v9inode->fscache_lock); fid = filp->private_data; if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else v9fs_cache_inode_get_cookie(inode); - spin_unlock(&vcookie->lock); + spin_unlock(&v9inode->fscache_lock); } void v9fs_cache_inode_reset_cookie(struct inode *inode) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct fscache_cookie *old; - if (!vcookie->fscache) + if (!v9inode->fscache) return; - old = vcookie->fscache; + old = v9inode->fscache; - spin_lock(&vcookie->lock); - fscache_relinquish_cookie(vcookie->fscache, 1); + spin_lock(&v9inode->fscache_lock); + fscache_relinquish_cookie(v9inode->fscache, 1); v9ses = v9fs_inode2v9ses(inode); - vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, + v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - vcookie); - + v9inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, vcookie->fscache); + inode, old, v9inode->fscache); - spin_unlock(&vcookie->lock); + spin_unlock(&v9inode->fscache_lock); } int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) { struct inode *inode = page->mapping->host; - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - BUG_ON(!vcookie->fscache); + BUG_ON(!v9inode->fscache); - return fscache_maybe_release_page(vcookie->fscache, page, gfp); + return fscache_maybe_release_page(v9inode->fscache, page, gfp); } void __v9fs_fscache_invalidate_page(struct page *page) { struct inode *inode = page->mapping->host; - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + struct v9fs_inode *v9inode = V9FS_I(inode); - BUG_ON(!vcookie->fscache); + BUG_ON(!v9inode->fscache); if (PageFsCache(page)) { - fscache_wait_on_page_write(vcookie->fscache, page); + fscache_wait_on_page_write(v9inode->fscache, page); BUG_ON(!PageLocked(page)); - fscache_uncache_page(vcookie->fscache, page); + fscache_uncache_page(v9inode->fscache, page); } } @@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data, int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); - if (!vcookie->fscache) + if (!v9inode->fscache) return -ENOBUFS; - ret = fscache_read_or_alloc_page(vcookie->fscache, + ret = fscache_read_or_alloc_page(v9inode->fscache, page, v9fs_vfs_readpage_complete, NULL, @@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode, unsigned *nr_pages) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); - if (!vcookie->fscache) + if (!v9inode->fscache) return -ENOBUFS; - ret = fscache_read_or_alloc_pages(vcookie->fscache, + ret = fscache_read_or_alloc_pages(v9inode->fscache, mapping, pages, nr_pages, v9fs_vfs_readpage_complete, NULL, @@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode, void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); + const struct v9fs_inode *v9inode = V9FS_I(inode); P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); - ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); + ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); if (ret != 0) v9fs_uncache_page(inode, page); } + +/* + * wait for a page to complete writing to the cache + */ +void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + const struct v9fs_inode *v9inode = V9FS_I(inode); + P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + if (PageFsCache(page)) + fscache_wait_on_page_write(v9inode->fscache, page); +} diff --git a/fs/9p/cache.h b/fs/9p/cache.h index a94192bfaee..049507a5b01 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -25,20 +25,6 @@ #include <linux/fscache.h> #include <linux/spinlock.h> -extern struct kmem_cache *vcookie_cache; - -struct v9fs_cookie { - spinlock_t lock; - struct inode inode; - struct fscache_cookie *fscache; - struct p9_qid *qid; -}; - -static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode) -{ - return container_of(inode, struct v9fs_cookie, inode); -} - extern struct fscache_netfs v9fs_cache_netfs; extern const struct fscache_cookie_def v9fs_cache_session_index_def; extern const struct fscache_cookie_def v9fs_cache_inode_index_def; @@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode, struct list_head *pages, unsigned *nr_pages); extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); - - -/** - * v9fs_cache_register - Register v9fs file system with the cache - */ -static inline int v9fs_cache_register(void) -{ - return __v9fs_cache_register(); -} - -/** - * v9fs_cache_unregister - Unregister v9fs from the cache - */ -static inline void v9fs_cache_unregister(void) -{ - __v9fs_cache_unregister(); -} +extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page); static inline int v9fs_fscache_release_page(struct page *page, gfp_t gfp) @@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode, static inline void v9fs_uncache_page(struct inode *inode, struct page *page) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); - fscache_uncache_page(vcookie->fscache, page); + struct v9fs_inode *v9inode = V9FS_I(inode); + fscache_uncache_page(v9inode->fscache, page); BUG_ON(PageFsCache(page)); } -static inline void v9fs_vcookie_set_qid(struct inode *inode, +static inline void v9fs_fscache_set_key(struct inode *inode, struct p9_qid *qid) { - struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); - spin_lock(&vcookie->lock); - vcookie->qid = qid; - spin_unlock(&vcookie->lock); + struct v9fs_inode *v9inode = V9FS_I(inode); + spin_lock(&v9inode->fscache_lock); + v9inode->fscache_key = qid; + spin_unlock(&v9inode->fscache_lock); } -#else /* CONFIG_9P_FSCACHE */ - -static inline int v9fs_cache_register(void) +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) { - return 1; + return __v9fs_fscache_wait_on_page_write(inode, page); } -static inline void v9fs_cache_unregister(void) {} +#else /* CONFIG_9P_FSCACHE */ static inline int v9fs_fscache_release_page(struct page *page, gfp_t gfp) { @@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode, static inline void v9fs_uncache_page(struct inode *inode, struct page *page) {} -static inline void v9fs_vcookie_set_qid(struct inode *inode, - struct p9_qid *qid) -{} +static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) +{ + return; +} #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index b00223c99d7..cd63e002d82 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -125,46 +125,17 @@ err_out: return -ENOMEM; } -/** - * v9fs_fid_lookup - lookup for a fid, try to walk if not found - * @dentry: dentry to look for fid in - * - * Look for a fid in the specified dentry for the current user. - * If no fid is found, try to create one walking from a fid from the parent - * dentry (if it has one), or the root dentry. If the user haven't accessed - * the fs yet, attach now and walk from the root. - */ - -struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, + uid_t uid, int any) { - int i, n, l, clone, any, access; - u32 uid; - struct p9_fid *fid, *old_fid = NULL; struct dentry *ds; - struct v9fs_session_info *v9ses; char **wnames, *uname; + int i, n, l, clone, access; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *old_fid = NULL; v9ses = v9fs_inode2v9ses(dentry->d_inode); access = v9ses->flags & V9FS_ACCESS_MASK; - switch (access) { - case V9FS_ACCESS_SINGLE: - case V9FS_ACCESS_USER: - case V9FS_ACCESS_CLIENT: - uid = current_fsuid(); - any = 0; - break; - - case V9FS_ACCESS_ANY: - uid = v9ses->uid; - any = 1; - break; - - default: - uid = ~0; - any = 0; - break; - } - fid = v9fs_fid_find(dentry, uid, any); if (fid) return fid; @@ -250,6 +221,45 @@ err_out: return fid; } +/** + * v9fs_fid_lookup - lookup for a fid, try to walk if not found + * @dentry: dentry to look for fid in + * + * Look for a fid in the specified dentry for the current user. + * If no fid is found, try to create one walking from a fid from the parent + * dentry (if it has one), or the root dentry. If the user haven't accessed + * the fs yet, attach now and walk from the root. + */ + +struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) +{ + uid_t uid; + int any, access; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + access = v9ses->flags & V9FS_ACCESS_MASK; + switch (access) { + case V9FS_ACCESS_SINGLE: + case V9FS_ACCESS_USER: + case V9FS_ACCESS_CLIENT: + uid = current_fsuid(); + any = 0; + break; + + case V9FS_ACCESS_ANY: + uid = v9ses->uid; + any = 1; + break; + + default: + uid = ~0; + any = 0; + break; + } + return v9fs_fid_lookup_with_uid(dentry, uid, any); +} + struct p9_fid *v9fs_fid_clone(struct dentry *dentry) { struct p9_fid *fid, *ret; @@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry) ret = p9_client_walk(fid, 0, NULL, 1); return ret; } + +static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid) +{ + struct p9_fid *fid, *ret; + + fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); + if (IS_ERR(fid)) + return fid; + + ret = p9_client_walk(fid, 0, NULL, 1); + return ret; +} + +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) +{ + int err; + struct p9_fid *fid; + + fid = v9fs_fid_clone_with_uid(dentry, 0); + if (IS_ERR(fid)) + goto error_out; + /* + * writeback fid will only be used to write back the + * dirty pages. We always request for the open fid in read-write + * mode so that a partial page write which result in page + * read can work. + */ + err = p9_client_open(fid, O_RDWR); + if (err < 0) { + p9_client_clunk(fid); + fid = ERR_PTR(err); + goto error_out; + } +error_out: + return fid; +} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index c3bbd6af996..bb0b6e7f58f 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -19,7 +19,8 @@ * Boston, MA 02111-1301 USA * */ - +#ifndef FS_9P_FID_H +#define FS_9P_FID_H #include <linux/list.h> /** @@ -45,3 +46,5 @@ struct v9fs_dentry { struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); struct p9_fid *v9fs_fid_clone(struct dentry *dentry); int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); +struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +#endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2f77cd33ba8..c82b017f51f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -39,6 +39,7 @@ static DEFINE_SPINLOCK(v9fs_sessionlist_lock); static LIST_HEAD(v9fs_sessionlist); +struct kmem_cache *v9fs_inode_cache; /* * Option Parsing (code inspired by NFS code) @@ -55,7 +56,7 @@ enum { /* Cache options */ Opt_cache_loose, Opt_fscache, /* Access options */ - Opt_access, + Opt_access, Opt_posixacl, /* Error token */ Opt_err }; @@ -73,6 +74,7 @@ static const match_table_t tokens = { {Opt_fscache, "fscache"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, + {Opt_posixacl, "posixacl"}, {Opt_err, NULL} }; @@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) else if (strcmp(s, "any") == 0) v9ses->flags |= V9FS_ACCESS_ANY; else if (strcmp(s, "client") == 0) { -#ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_ACCESS_CLIENT; -#else - P9_DPRINTK(P9_DEBUG_ERROR, - "access=client option not supported\n"); - kfree(s); - ret = -EINVAL; - goto free_and_return; -#endif } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); @@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) kfree(s); break; + case Opt_posixacl: +#ifdef CONFIG_9P_FS_POSIX_ACL + v9ses->flags |= V9FS_POSIX_ACL; +#else + P9_DPRINTK(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. " + "Ignoring posixacl option\n"); +#endif + break; + default: continue; } @@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->flags = V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - rc = v9fs_parse_options(v9ses, data); - if (rc < 0) { - retval = rc; - goto error; - } - v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); @@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto error; } - if (p9_is_proto_dotl(v9ses->clnt)) + v9ses->flags = V9FS_ACCESS_USER; + + if (p9_is_proto_dotl(v9ses->clnt)) { + v9ses->flags = V9FS_ACCESS_CLIENT; v9ses->flags |= V9FS_PROTO_2000L; - else if (p9_is_proto_dotu(v9ses->clnt)) + } else if (p9_is_proto_dotu(v9ses->clnt)) { v9ses->flags |= V9FS_PROTO_2000U; + } + + rc = v9fs_parse_options(v9ses, data); + if (rc < 0) { + retval = rc; + goto error; + } v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; @@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags |= V9FS_ACCESS_ANY; v9ses->uid = ~0; } + if (!v9fs_proto_dotl(v9ses) || + !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { + /* + * We support ACL checks on clinet only if the protocol is + * 9P2000.L and access is V9FS_ACCESS_CLIENT. + */ + v9ses->flags &= ~V9FS_ACL_MASK; + } fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, v9ses->aname); @@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void) kobject_put(v9fs_kobj); } +static void v9fs_inode_init_once(void *foo) +{ + struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + v9inode->fscache_key = NULL; +#endif + inode_init_once(&v9inode->vfs_inode); +} + +/** + * v9fs_init_inode_cache - initialize a cache for 9P + * Returns 0 on success. + */ +static int v9fs_init_inode_cache(void) +{ + v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", + sizeof(struct v9fs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + v9fs_inode_init_once); + if (!v9fs_inode_cache) + return -ENOMEM; + + return 0; +} + +/** + * v9fs_destroy_inode_cache - destroy the cache of 9P inode + * + */ +static void v9fs_destroy_inode_cache(void) +{ + kmem_cache_destroy(v9fs_inode_cache); +} + +static int v9fs_cache_register(void) +{ + int ret; + ret = v9fs_init_inode_cache(); + if (ret < 0) + return ret; +#ifdef CONFIG_9P_FSCACHE + return fscache_register_netfs(&v9fs_cache_netfs); +#else + return ret; +#endif +} + +static void v9fs_cache_unregister(void) +{ + v9fs_destroy_inode_cache(); +#ifdef CONFIG_9P_FSCACHE + fscache_unregister_netfs(&v9fs_cache_netfs); +#endif +} + /** * init_v9fs - Initialize module * diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index c4b5d8864f0..bd8496db135 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -20,6 +20,9 @@ * Boston, MA 02111-1301 USA * */ +#ifndef FS_9P_V9FS_H +#define FS_9P_V9FS_H + #include <linux/backing-dev.h> /** @@ -28,8 +31,10 @@ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) + * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. * @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options + * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ @@ -37,13 +42,15 @@ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY +#define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_CLIENT = 0x10 + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20 }; /* possible values of ->cache */ @@ -109,8 +116,28 @@ struct v9fs_session_info { struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; struct rw_semaphore rename_sem; + struct p9_fid *root_fid; /* Used for file system sync */ +}; + +/* cache_validity flags */ +#define V9FS_INO_INVALID_ATTR 0x01 + +struct v9fs_inode { +#ifdef CONFIG_9P_FSCACHE + spinlock_t fscache_lock; + struct fscache_cookie *fscache; + struct p9_qid *fscache_key; +#endif + unsigned int cache_validity; + struct p9_fid *writeback_fid; + struct inode vfs_inode; }; +static inline struct v9fs_inode *V9FS_I(const struct inode *inode) +{ + return container_of(inode, struct v9fs_inode, vfs_inode); +} + struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); extern void v9fs_session_close(struct v9fs_session_info *v9ses); @@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p); -extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb); - +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; -extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb); +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb); /* other default globals */ #define V9FS_PORT 564 @@ -158,7 +184,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) } /** - * v9fs_inode_from_fid - Helper routine to populate an inode by + * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for @@ -166,11 +192,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) * */ static inline struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_dotl(v9ses, fid, sb); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb); else - return v9fs_inode(v9ses, fid, sb); + return v9fs_inode_from_fid(v9ses, fid, sb); } +#endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index b789f8e597e..4014160903a 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -20,6 +20,8 @@ * Boston, MA 02111-1301 USA * */ +#ifndef FS_9P_V9FS_VFS_H +#define FS_9P_V9FS_VFS_H /* plan9 semantics are that created files are implicitly opened. * But linux semantics are that you call create, then open. @@ -36,6 +38,7 @@ * unlink calls remove, which is an implicit clunk. So we have to track * that kind of thing so that we don't try to clunk a dead fid. */ +#define P9_LOCK_TIMEOUT (30*HZ) extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; @@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations; extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; +extern const struct file_operations v9fs_cached_file_operations; +extern const struct file_operations v9fs_cached_file_operations_dotl; +extern struct kmem_cache *v9fs_inode_cache; -#ifdef CONFIG_9P_FSCACHE struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -#endif - struct inode *v9fs_get_inode(struct super_block *sb, int mode); +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); @@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); +ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); void v9fs_blank_wstat(struct p9_wstat *wstat); int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); int v9fs_file_fsync_dotl(struct file *filp, int datasync); - -#define P9_LOCK_TIMEOUT (30*HZ) +ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, + const char __user *, size_t, loff_t *, int); +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); +static inline void v9fs_invalidate_inode_attr(struct inode *inode) +{ + struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); + v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; + return; +} +#endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index b7f2a8e3863..2524e4cbb8e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -39,16 +39,16 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" +#include "fid.h" /** - * v9fs_vfs_readpage - read an entire page in from 9P + * v9fs_fid_readpage - read an entire page in from 9P * - * @filp: file being read + * @fid: fid being read * @page: structure to page * */ - -static int v9fs_vfs_readpage(struct file *filp, struct page *page) +static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) { int retval; loff_t offset; @@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) buffer = kmap(page); offset = page_offset(page); - retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); + retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset); if (retval < 0) { v9fs_uncache_page(inode, page); goto done; @@ -87,6 +87,19 @@ done: } /** + * v9fs_vfs_readpage - read an entire page in from 9P + * + * @filp: file being read + * @page: structure to page + * + */ + +static int v9fs_vfs_readpage(struct file *filp, struct page *page) +{ + return v9fs_fid_readpage(filp->private_data, page); +} + +/** * v9fs_vfs_readpages - read a set of pages from 9P * * @filp: file being read @@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - return v9fs_fscache_release_page(page, gfp); } @@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) static void v9fs_invalidate_page(struct page *page, unsigned long offset) { + /* + * If called with zero offset, we should release + * the private state assocated with the page + */ if (offset == 0) v9fs_fscache_invalidate_page(page); } +static int v9fs_vfs_writepage_locked(struct page *page) +{ + char *buffer; + int retval, len; + loff_t offset, size; + mm_segment_t old_fs; + struct v9fs_inode *v9inode; + struct inode *inode = page->mapping->host; + + v9inode = V9FS_I(inode); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + set_page_writeback(page); + + buffer = kmap(page); + offset = page_offset(page); + + old_fs = get_fs(); + set_fs(get_ds()); + /* We should have writeback_fid always set */ + BUG_ON(!v9inode->writeback_fid); + + retval = v9fs_file_write_internal(inode, + v9inode->writeback_fid, + (__force const char __user *)buffer, + len, &offset, 0); + if (retval > 0) + retval = 0; + + set_fs(old_fs); + kunmap(page); + end_page_writeback(page); + return retval; +} + +static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) +{ + int retval; + + retval = v9fs_vfs_writepage_locked(page); + if (retval < 0) { + if (retval == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + retval = 0; + } else { + SetPageError(page); + mapping_set_error(page->mapping, retval); + } + } else + retval = 0; + + unlock_page(page); + return retval; +} + /** * v9fs_launder_page - Writeback a dirty page - * Since the writes go directly to the server, we simply return a 0 - * here to indicate success. - * * Returns 0 on success. */ static int v9fs_launder_page(struct page *page) { + int retval; + struct inode *inode = page->mapping->host; + + v9fs_fscache_wait_on_page_write(inode, page); + if (clear_page_dirty_for_io(page)) { + retval = v9fs_vfs_writepage_locked(page); + if (retval) + return retval; + } return 0; } @@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page) * with an error. * */ -ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +static ssize_t +v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t pos, unsigned long nr_segs) { + /* + * FIXME + * Now that we do caching with cache mode enabled, We need + * to support direct IO + */ P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " "off/no(%lld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, @@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return -EINVAL; } + +static int v9fs_write_begin(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int retval = 0; + struct page *page; + struct v9fs_inode *v9inode; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = mapping->host; + + v9inode = V9FS_I(inode); +start: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + retval = -ENOMEM; + goto out; + } + BUG_ON(!v9inode->writeback_fid); + if (PageUptodate(page)) + goto out; + + if (len == PAGE_CACHE_SIZE) + goto out; + + retval = v9fs_fid_readpage(v9inode->writeback_fid, page); + page_cache_release(page); + if (!retval) + goto start; +out: + *pagep = page; + return retval; +} + +static int v9fs_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + + if (unlikely(copied < len)) { + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) { + inode_add_bytes(inode, last_pos - inode->i_size); + i_size_write(inode, last_pos); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + + const struct address_space_operations v9fs_addr_operations = { - .readpage = v9fs_vfs_readpage, - .readpages = v9fs_vfs_readpages, - .releasepage = v9fs_release_page, - .invalidatepage = v9fs_invalidate_page, - .launder_page = v9fs_launder_page, - .direct_IO = v9fs_direct_IO, + .readpage = v9fs_vfs_readpage, + .readpages = v9fs_vfs_readpages, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = v9fs_vfs_writepage, + .write_begin = v9fs_write_begin, + .write_end = v9fs_write_end, + .releasepage = v9fs_release_page, + .invalidatepage = v9fs_invalidate_page, + .launder_page = v9fs_launder_page, + .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 233b7d4ffe5..b6a3b9f7fe4 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry) * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question * - * Only return 1 if our inode is invalid. Only non-synthetic files - * (ones without mtime == 0) should be calling this function. - * */ - static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); - if(!inode) + /* Don't cache negative dentries */ + if (!dentry->d_inode) return 1; - return 0; } @@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry) } } +static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct p9_fid *fid; + struct inode *inode; + struct v9fs_inode *v9inode; + + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + if (!inode) + goto out_valid; + + v9inode = V9FS_I(inode); + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + int retval; + struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + v9ses = v9fs_inode2v9ses(inode); + if (v9fs_proto_dotl(v9ses)) + retval = v9fs_refresh_inode_dotl(fid, inode); + else + retval = v9fs_refresh_inode(fid, inode); + if (retval <= 0) + return retval; + } +out_valid: + return 1; +} + const struct dentry_operations v9fs_cached_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, .d_delete = v9fs_cached_dentry_delete, .d_release = v9fs_dentry_release, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b84ebe8cefe..9c2bdda5cd9 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) P9_DPRINTK(P9_DEBUG_VFS, "v9fs_dir_release: inode: %p filp: %p fid: %d\n", inode, filp, fid ? fid->fid : -1); - filemap_write_and_wait(inode->i_mapping); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 240c3067439..78bcb97c342 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -44,8 +44,7 @@ #include "fid.h" #include "cache.h" -static const struct file_operations v9fs_cached_file_operations; -static const struct file_operations v9fs_cached_file_operations_dotl; +static const struct vm_operations_struct v9fs_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl; int v9fs_file_open(struct inode *inode, struct file *file) { int err; + struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; struct p9_fid *fid; int omode; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = file->f_flags; @@ -89,20 +90,30 @@ int v9fs_file_open(struct inode *inode, struct file *file) } file->private_data = fid; - if ((fid->qid.version) && (v9ses->cache)) { - P9_DPRINTK(P9_DEBUG_VFS, "cached"); - /* enable cached file options */ - if(file->f_op == &v9fs_file_operations) - file->f_op = &v9fs_cached_file_operations; - else if (file->f_op == &v9fs_file_operations_dotl) - file->f_op = &v9fs_cached_file_operations_dotl; - + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(file->f_path.dentry); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + goto out_error; + } + v9inode->writeback_fid = (void *) fid; + } #ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) v9fs_cache_inode_set_cookie(inode, file); #endif - } - return 0; +out_error: + p9_client_clunk(file->private_data); + file->private_data = NULL; + return err; } /** @@ -335,25 +346,22 @@ out_err: } /** - * v9fs_file_readn - read from a file - * @filp: file pointer to read + * v9fs_fid_readn - read from a fid + * @fid: fid to read * @data: data buffer to read data into * @udata: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * */ - ssize_t -v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, +v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, u64 offset) { int n, total, size; - struct p9_fid *fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); - + (long long unsigned) offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -379,6 +387,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, } /** + * v9fs_file_readn - read from a file + * @filp: file pointer to read + * @data: data buffer to read data into + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +ssize_t +v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, + u64 offset) +{ + return v9fs_fid_readn(filp->private_data, data, udata, count, offset); +} + +/** * v9fs_file_read - read from a file * @filp: file pointer to read * @udata: user data buffer to read data into @@ -410,45 +434,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, return ret; } -/** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data - * - */ - -static ssize_t -v9fs_file_write(struct file *filp, const char __user * data, - size_t count, loff_t * offset) +ssize_t +v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, + const char __user *data, size_t count, + loff_t *offset, int invalidate) { - ssize_t retval; - size_t total = 0; int n; - struct p9_fid *fid; + loff_t i_size; + size_t total = 0; struct p9_client *clnt; - struct inode *inode = filp->f_path.dentry->d_inode; loff_t origin = *offset; unsigned long pg_start, pg_end; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - fid = filp->private_data; clnt = fid->clnt; - - retval = generic_write_checks(filp, &origin, &count, 0); - if (retval) - goto out; - - retval = -EINVAL; - if ((ssize_t) count < 0) - goto out; - retval = 0; - if (!count) - goto out; - do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) @@ -457,25 +458,60 @@ v9fs_file_write(struct file *filp, const char __user * data, total += n; } while (count > 0); - if (total > 0) { + if (invalidate && (total > 0)) { pg_start = origin >> PAGE_CACHE_SHIFT; pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; if (inode->i_mapping && inode->i_mapping->nrpages) invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); *offset += total; - i_size_write(inode, i_size_read(inode) + total); - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + i_size = i_size_read(inode); + if (*offset > i_size) { + inode_add_bytes(inode, *offset - i_size); + i_size_write(inode, *offset); + } } - if (n < 0) - retval = n; - else - retval = total; + return n; + + return total; +} + +/** + * v9fs_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + ssize_t retval = 0; + loff_t origin = *offset; + + + retval = generic_write_checks(filp, &origin, &count, 0); + if (retval) + goto out; + + retval = -EINVAL; + if ((ssize_t) count < 0) + goto out; + retval = 0; + if (!count) + goto out; + + return v9fs_file_write_internal(filp->f_path.dentry->d_inode, + filp->private_data, + data, count, offset, 1); out: return retval; } + static int v9fs_file_fsync(struct file *filp, int datasync) { struct p9_fid *fid; @@ -505,28 +541,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync) return retval; } -static const struct file_operations v9fs_cached_file_operations = { +static int +v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int retval; + + retval = generic_file_mmap(file, vma); + if (!retval) + vma->vm_ops = &v9fs_file_vm_ops; + + return retval; +} + +static int +v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct v9fs_inode *v9inode; + struct page *page = vmf->page; + struct file *filp = vma->vm_file; + struct inode *inode = filp->f_path.dentry->d_inode; + + + P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); + + v9inode = V9FS_I(inode); + /* make sure the cache has finished storing the page */ + v9fs_fscache_wait_on_page_write(inode, page); + BUG_ON(!v9inode->writeback_fid); + lock_page(page); + if (page->mapping != inode->i_mapping) + goto out_unlock; + + return VM_FAULT_LOCKED; +out_unlock: + unlock_page(page); + return VM_FAULT_NOPAGE; +} + +static ssize_t +v9fs_direct_read(struct file *filp, char __user *udata, size_t count, + loff_t *offsetp) +{ + loff_t size, offset; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + size = i_size_read(inode); + if (offset < size) + filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + + return v9fs_file_read(filp, udata, count, offsetp); +} + +/** + * v9fs_cached_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + if (filp->f_flags & O_DIRECT) + return v9fs_direct_read(filp, data, count, offset); + return do_sync_read(filp, data, count, offset); +} + +static ssize_t +v9fs_direct_write(struct file *filp, const char __user * data, + size_t count, loff_t *offsetp) +{ + loff_t offset; + ssize_t retval; + struct inode *inode; + struct address_space *mapping; + + offset = *offsetp; + mapping = filp->f_mapping; + inode = mapping->host; + if (!count) + return 0; + + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, offset, + offset + count - 1); + if (retval) + goto err_out; + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that if we fail + * here we fall back to buffered write + */ + if (mapping->nrpages) { + pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT; + pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + + retval = invalidate_inode_pages2_range(mapping, + pg_start, pg_end); + /* + * If a page can not be invalidated, fall back + * to buffered write. + */ + if (retval) { + if (retval == -EBUSY) + goto buff_write; + goto err_out; + } + } + retval = v9fs_file_write(filp, data, count, offsetp); +err_out: + mutex_unlock(&inode->i_mutex); + return retval; + +buff_write: + mutex_unlock(&inode->i_mutex); + return do_sync_write(filp, data, count, offsetp); +} + +/** + * v9fs_cached_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_cached_file_write(struct file *filp, const char __user * data, + size_t count, loff_t *offset) +{ + + if (filp->f_flags & O_DIRECT) + return v9fs_direct_write(filp, data, count, offset); + return do_sync_write(filp, data, count, offset); +} + +static const struct vm_operations_struct v9fs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, +}; + + +const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, .aio_read = generic_file_aio_read, - .write = v9fs_file_write, + .aio_write = generic_file_aio_write, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, - .mmap = generic_file_readonly_mmap, + .mmap = v9fs_file_mmap, .fsync = v9fs_file_fsync, }; -static const struct file_operations v9fs_cached_file_operations_dotl = { +const struct file_operations v9fs_cached_file_operations_dotl = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = v9fs_cached_file_read, + .write = v9fs_cached_file_write, .aio_read = generic_file_aio_read, - .write = v9fs_file_write, + .aio_write = generic_file_aio_write, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, - .mmap = generic_file_readonly_mmap, + .mmap = v9fs_file_mmap, .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b76a40bdf4c..8a2c232f708 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -203,26 +203,25 @@ v9fs_blank_wstat(struct p9_wstat *wstat) wstat->extension = NULL; } -#ifdef CONFIG_9P_FSCACHE /** * v9fs_alloc_inode - helper function to allocate an inode - * This callback is executed before setting up the inode so that we - * can associate a vcookie with each inode. * */ - struct inode *v9fs_alloc_inode(struct super_block *sb) { - struct v9fs_cookie *vcookie; - vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, - GFP_KERNEL); - if (!vcookie) + struct v9fs_inode *v9inode; + v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache, + GFP_KERNEL); + if (!v9inode) return NULL; - - vcookie->fscache = NULL; - vcookie->qid = NULL; - spin_lock_init(&vcookie->lock); - return &vcookie->inode; +#ifdef CONFIG_9P_FSCACHE + v9inode->fscache = NULL; + v9inode->fscache_key = NULL; + spin_lock_init(&v9inode->fscache_lock); +#endif + v9inode->writeback_fid = NULL; + v9inode->cache_validity = 0; + return &v9inode->vfs_inode; } /** @@ -234,35 +233,18 @@ static void v9fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); - kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); + kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } void v9fs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, v9fs_i_callback); } -#endif -/** - * v9fs_get_inode - helper function to setup an inode - * @sb: superblock - * @mode: mode to setup inode with - * - */ - -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +int v9fs_init_inode(struct v9fs_session_info *v9ses, + struct inode *inode, int mode) { - int err; - struct inode *inode; - struct v9fs_session_info *v9ses = sb->s_fs_info; - - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); - - inode = new_inode(sb); - if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); - return ERR_PTR(-ENOMEM); - } + int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; @@ -292,14 +274,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - inode->i_fop = &v9fs_file_operations_dotl; + if (v9ses->cache) + inode->i_fop = + &v9fs_cached_file_operations_dotl; + else + inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; + if (v9ses->cache) + inode->i_fop = &v9fs_cached_file_operations; + else + inode->i_fop = &v9fs_file_operations; } break; - case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " @@ -335,12 +323,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) err = -EINVAL; goto error; } +error: + return err; - return inode; +} -error: - iput(inode); - return ERR_PTR(err); +/** + * v9fs_get_inode - helper function to setup an inode + * @sb: superblock + * @mode: mode to setup inode with + * + */ + +struct inode *v9fs_get_inode(struct super_block *sb, int mode) +{ + int err; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + + inode = new_inode(sb); + if (!inode) { + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + return ERR_PTR(-ENOMEM); + } + err = v9fs_init_inode(v9ses, inode, mode); + if (err) { + iput(inode); + return ERR_PTR(err); + } + return inode; } /* @@ -403,6 +416,8 @@ error: */ void v9fs_evict_inode(struct inode *inode) { + struct v9fs_inode *v9inode = V9FS_I(inode); + truncate_inode_pages(inode->i_mapping, 0); end_writeback(inode); filemap_fdatawrite(inode->i_mapping); @@ -410,41 +425,67 @@ void v9fs_evict_inode(struct inode *inode) #ifdef CONFIG_9P_FSCACHE v9fs_cache_inode_put_cookie(inode); #endif + /* clunk the fid stashed in writeback_fid */ + if (v9inode->writeback_fid) { + p9_client_clunk(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } } -struct inode * -v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st) { - int err, umode; - struct inode *ret = NULL; - struct p9_wstat *st; - - st = p9_client_stat(fid); - if (IS_ERR(st)) - return ERR_CAST(st); + int retval, umode; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + i_ino = v9fs_qid2ino(qid); + inode = iget_locked(sb, i_ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ umode = p9mode2unixmode(v9ses, st->mode); - ret = v9fs_get_inode(sb, umode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); + retval = v9fs_init_inode(v9ses, inode, umode); + if (retval) goto error; - } - - v9fs_stat2inode(st, ret, sb); - ret->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, inode, sb); #ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); + v9fs_fscache_set_key(inode, &st->qid); + v9fs_cache_inode_get_cookie(inode); #endif - p9stat_free(st); - kfree(st); - return ret; + unlock_new_inode(inode); + return inode; error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st); p9stat_free(st); kfree(st); - return ERR_PTR(err); + return inode; } /** @@ -458,8 +499,8 @@ error: static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { int retval; - struct inode *file_inode; struct p9_fid *v9fid; + struct inode *file_inode; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); @@ -470,8 +511,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) return PTR_ERR(v9fid); retval = p9_client_remove(v9fid); - if (!retval) - drop_nlink(file_inode); + if (!retval) { + /* + * directories on unlink should have zero + * link count + */ + if (rmdir) { + clear_nlink(file_inode); + drop_nlink(dir); + } else + drop_nlink(file_inode); + + v9fs_invalidate_inode_attr(file_inode); + v9fs_invalidate_inode_attr(dir); + } return retval; } @@ -531,7 +584,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -570,9 +623,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, int err; u32 perm; int flags; - struct v9fs_session_info *v9ses; - struct p9_fid *fid; struct file *filp; + struct v9fs_inode *v9inode; + struct v9fs_session_info *v9ses; + struct p9_fid *fid, *inode_fid; err = 0; fid = NULL; @@ -592,8 +646,25 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, goto error; } + v9fs_invalidate_inode_attr(dir); /* if we are opening a file, assign the open fid to the file */ if (nd && nd->flags & LOOKUP_OPEN) { + v9inode = V9FS_I(dentry->d_inode); + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { err = PTR_ERR(filp); @@ -601,6 +672,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, } filp->private_data = fid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(dentry->d_inode, filp); +#endif } else p9_client_clunk(fid); @@ -625,8 +700,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { int err; u32 perm; - struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct v9fs_session_info *v9ses; P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; @@ -636,6 +711,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; + } else { + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); } if (fid) @@ -687,7 +765,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; @@ -747,17 +825,19 @@ int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int retval; struct inode *old_inode; + struct inode *new_inode; struct v9fs_session_info *v9ses; struct p9_fid *oldfid; struct p9_fid *olddirfid; struct p9_fid *newdirfid; struct p9_wstat wstat; - int retval; P9_DPRINTK(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; v9ses = v9fs_inode2v9ses(old_inode); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) @@ -798,9 +878,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: - if (!retval) + if (!retval) { + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else + drop_nlink(new_inode); + /* + * Work around vfs rename rehash bug with + * FS_RENAME_DOES_D_MOVE + */ + v9fs_invalidate_inode_attr(new_inode); + } + if (S_ISDIR(old_inode->i_mode)) { + if (!new_inode) + inc_nlink(new_dir); + drop_nlink(old_dir); + } + v9fs_invalidate_inode_attr(old_inode); + v9fs_invalidate_inode_attr(old_dir); + v9fs_invalidate_inode_attr(new_dir); + /* successful rename */ d_move(old_dentry, new_dentry); + } up_write(&v9ses->rename_sem); p9_client_clunk(newdirfid); @@ -831,9 +932,10 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -891,17 +993,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (iattr->ia_valid & ATTR_GID) wstat.n_gid = iattr->ia_gid; } - - retval = p9_client_wstat(fid, &wstat); - if (retval < 0) - return retval; - if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(dentry->d_inode)) { retval = vmtruncate(dentry->d_inode, iattr->ia_size); if (retval) return retval; } + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_wstat(fid, &wstat); + if (retval < 0) + return retval; + v9fs_invalidate_inode_attr(dentry->d_inode); setattr_copy(dentry->d_inode, iattr); mark_inode_dirty(dentry->d_inode); @@ -924,6 +1029,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, char tag_name[14]; unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_inode *v9inode = V9FS_I(inode); inode->i_nlink = 1; @@ -983,6 +1089,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, /* not real number of blocks, but 512 byte ones ... */ inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } /** @@ -1115,8 +1222,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { u32 perm; - struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { @@ -1130,6 +1237,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, if (IS_ERR(fid)) return PTR_ERR(fid); + v9fs_invalidate_inode_attr(dir); p9_client_clunk(fid); return 0; } @@ -1166,8 +1274,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct p9_fid *oldfid; char *name; + struct p9_fid *oldfid; P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, @@ -1186,7 +1294,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); __putname(name); - + if (!retval) { + v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_invalidate_inode_attr(dir); + } clunk_fid: p9_client_clunk(oldfid); return retval; @@ -1237,6 +1348,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } +int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_wstat *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_stat(fid); + if (IS_ERR(st)) + return PTR_ERR(st); + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode(st, inode, inode->i_sb); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); + p9stat_free(st); + kfree(st); + return 0; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index fe3ffa9aace..67c138e94fe 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) return dentry; } +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st) +{ + int retval; + unsigned long i_ino; + struct inode *inode; + struct v9fs_session_info *v9ses = sb->s_fs_info; + + i_ino = v9fs_qid2ino(qid); + inode = iget_locked(sb, i_ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + /* + * initialize the inode with the stat info + * FIXME!! we may need support for stale inodes + * later. + */ + retval = v9fs_init_inode(v9ses, inode, st->st_mode); + if (retval) + goto error; + + v9fs_stat2inode_dotl(st, inode); +#ifdef CONFIG_9P_FSCACHE + v9fs_fscache_set_key(inode, &st->qid); + v9fs_cache_inode_get_cookie(inode); +#endif + retval = v9fs_get_acl(inode, fid); + if (retval) + goto error; + + unlock_new_inode(inode); + return inode; +error: + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(retval); + +} + struct inode * -v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) { - struct inode *ret = NULL; - int err; struct p9_stat_dotl *st; + struct inode *inode = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) return ERR_CAST(st); - ret = v9fs_get_inode(sb, st->st_mode); - if (IS_ERR(ret)) { - err = PTR_ERR(ret); - goto error; - } - - v9fs_stat2inode_dotl(st, ret); - ret->i_ino = v9fs_qid2ino(&st->qid); -#ifdef CONFIG_9P_FSCACHE - v9fs_vcookie_set_qid(ret, &st->qid); - v9fs_cache_inode_get_cookie(ret); -#endif - err = v9fs_get_acl(ret, fid); - if (err) { - iput(ret); - goto error; - } - kfree(st); - return ret; -error: + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st); kfree(st); - return ERR_PTR(err); + return inode; } /** @@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, struct nameidata *nd) { int err = 0; - char *name = NULL; gid_t gid; int flags; mode_t mode; - struct v9fs_session_info *v9ses; - struct p9_fid *fid = NULL; - struct p9_fid *dfid, *ofid; + char *name = NULL; struct file *filp; struct p9_qid qid; struct inode *inode; + struct p9_fid *fid = NULL; + struct v9fs_inode *v9inode; + struct p9_fid *dfid, *ofid, *inode_fid; + struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; v9ses = v9fs_inode2v9ses(dir); @@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } + v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); @@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = NULL; goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -219,6 +244,22 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, dacl, pacl); + v9inode = V9FS_I(inode); + if (v9ses->cache && !v9inode->writeback_fid) { + /* + * clone a fid and add it to writeback_fid + * we do it during open time instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + inode_fid = v9fs_writeback_fid(dentry); + if (IS_ERR(inode_fid)) { + err = PTR_ERR(inode_fid); + goto error; + } + v9inode->writeback_fid = (void *) inode_fid; + } /* Since we are opening a file, assign the open fid to the file */ filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { @@ -226,6 +267,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, return PTR_ERR(filp); } filp->private_data = ofid; +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache) + v9fs_cache_inode_set_cookie(inode, filp); +#endif return 0; error: @@ -300,7 +345,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -327,7 +372,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, } /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, dacl, pacl); - + inc_nlink(dir); + v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); @@ -346,9 +392,10 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_inode2v9ses(dentry->d_inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - return simple_getattr(mnt, dentry, stat); - + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + generic_fillattr(dentry->d_inode, stat); + return 0; + } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -406,16 +453,20 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) if (IS_ERR(fid)) return PTR_ERR(fid); - retval = p9_client_setattr(fid, &p9attr); - if (retval < 0) - return retval; - if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(dentry->d_inode)) { retval = vmtruncate(dentry->d_inode, iattr->ia_size); if (retval) return retval; } + /* Write all dirty data */ + if (S_ISREG(dentry->d_inode->i_mode)) + filemap_write_and_wait(dentry->d_inode->i_mapping); + + retval = p9_client_setattr(fid, &p9attr); + if (retval < 0) + return retval; + v9fs_invalidate_inode_attr(dentry->d_inode); setattr_copy(dentry->d_inode, iattr); mark_inode_dirty(dentry->d_inode); @@ -439,6 +490,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { inode->i_atime.tv_sec = stat->st_atime_sec; @@ -497,20 +549,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. */ + v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } static int v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, const char *symname) { - struct v9fs_session_info *v9ses; - struct p9_fid *dfid; - struct p9_fid *fid = NULL; - struct inode *inode; - struct p9_qid qid; - char *name; int err; gid_t gid; + char *name; + struct p9_qid qid; + struct inode *inode; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", @@ -534,6 +587,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, goto error; } + v9fs_invalidate_inode_attr(dir); if (v9ses->cache) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); @@ -546,7 +600,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -588,10 +642,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - struct p9_fid *dfid, *oldfid; char *name; - struct v9fs_session_info *v9ses; struct dentry *dir_dentry; + struct p9_fid *dfid, *oldfid; + struct v9fs_session_info *v9ses; P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", dir->i_ino, old_dentry->d_name.name, @@ -616,29 +670,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, return err; } + v9fs_invalidate_inode_attr(dir); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Get the latest stat info from server. */ struct p9_fid *fid; - struct p9_stat_dotl *st; - fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) - return PTR_ERR(st); - - v9fs_stat2inode_dotl(st, old_dentry->d_inode); - - kfree(st); - } else { - /* Caching disabled. No need to get upto date stat info. - * This dentry will be released immediately. So, just hold the - * inode - */ - ihold(old_dentry->d_inode); + v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); } + ihold(old_dentry->d_inode); d_instantiate(dentry, old_dentry->d_inode); return err; @@ -657,12 +699,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, dev_t rdev) { int err; + gid_t gid; char *name; mode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; - gid_t gid; struct p9_qid qid; struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; @@ -699,6 +741,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, if (err < 0) goto error; + v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { fid = p9_client_walk(dfid, 1, &name, 1); @@ -710,7 +753,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; } - inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -782,6 +825,31 @@ ndset: return NULL; } +int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) +{ + loff_t i_size; + struct p9_stat_dotl *st; + struct v9fs_session_info *v9ses; + + v9ses = v9fs_inode2v9ses(inode); + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + spin_lock(&inode->i_lock); + /* + * We don't want to refresh inode->i_size, + * because we may have cached data + */ + i_size = inode->i_size; + v9fs_stat2inode_dotl(st, inode); + if (v9ses->cache) + inode->i_size = i_size; + spin_unlock(&inode->i_lock); + kfree(st); + return 0; +} + const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, .lookup = v9fs_vfs_lookup, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index dbaabe3b813..09fd08d1606 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, } else sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; + if (v9ses->cache) + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE; - sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | - MS_NOATIME; + sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + if (!v9ses->cache) + sb->s_flags |= MS_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL - if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) + if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) sb->s_flags |= MS_POSIXACL; #endif @@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(inode); goto release_sb; } - root = d_alloc_root(inode); if (!root) { iput(inode); @@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(st); goto release_sb; } - + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode_dotl(st, root->d_inode); kfree(st); } else { @@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, p9stat_free(st); kfree(st); } + v9fs_fid_add(root, fid); retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; - v9fs_fid_add(root, fid); + /* + * Add the root fid to session info. This is used + * for file system sync. We want a cloned fid here + * so that we can do a sync_filesystem after a + * shrink_dcache_for_umount + */ + v9ses->root_fid = v9fs_fid_clone(root); + if (IS_ERR(v9ses->root_fid)) { + retval = PTR_ERR(v9ses->root_fid); + goto release_sb; + } P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); @@ -197,15 +210,11 @@ close_session: v9fs_session_close(v9ses); kfree(v9ses); return ERR_PTR(retval); - release_sb: /* - * we will do the session_close and root dentry release - * in the below call. But we need to clunk fid, because we haven't - * attached the fid to dentry so it won't get clunked - * automatically. + * we will do the session_close and root dentry + * release in the below call. */ - p9_client_clunk(fid); deactivate_locked_super(sb); return ERR_PTR(retval); } @@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); - + p9_client_clunk(v9ses->root_fid); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); @@ -276,11 +285,31 @@ done: return res; } +static int v9fs_sync_fs(struct super_block *sb, int wait) +{ + struct v9fs_session_info *v9ses = sb->s_fs_info; + + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb); + return p9_client_sync_fs(v9ses->root_fid); +} + +static int v9fs_drop_inode(struct inode *inode) +{ + struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); + if (v9ses->cache) + return generic_drop_inode(inode); + /* + * in case of non cached mode always drop the + * the inode because we want the inode attribute + * to always match that on the server. + */ + return 1; +} + static const struct super_operations v9fs_super_ops = { -#ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, .destroy_inode = v9fs_destroy_inode, -#endif .statfs = simple_statfs, .evict_inode = v9fs_evict_inode, .show_options = generic_show_options, @@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = { }; static const struct super_operations v9fs_super_ops_dotl = { -#ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, .destroy_inode = v9fs_destroy_inode, -#endif + .sync_fs = v9fs_sync_fs, .statfs = v9fs_statfs, + .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, @@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = { .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, - .fs_flags = FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT, }; diff --git a/fs/Kconfig b/fs/Kconfig index 3db9caa57ed..7cb53aafac1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT diff --git a/fs/Makefile b/fs/Makefile index a7f7cef0c0c..ba01202844c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_FHANDLE) += fhandle.o + obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ diff --git a/fs/afs/write.c b/fs/afs/write.c index 15690bb1d3b..789b3afb342 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->first = candidate->last = index; candidate->offset_first = from; candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); @@ -85,7 +85,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = create_workqueue("aio"); + aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); BUG_ON(!aio_wq || !abe_pool); @@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -#define get_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - atomic_inc(&(kioctx)->users); \ -} while (0) -#define put_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ - __put_ioctx(kioctx); \ -} while (0) +static inline void get_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + atomic_inc(&kioctx->users); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. @@ -569,7 +577,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); - queue_work(aio_wq, &fput_work); + schedule_work(&fput_work); } else { req->ki_filp = NULL; really_put_req(ctx, req); @@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id && !ctx->dead) { - get_ioctx(ctx); + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ ret = ctx; break; } @@ -1629,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; spin_lock_irq(&ctx->ctx_lock); + /* + * We could have raced with io_destroy() and are currently holding a + * reference to ctx which should be destroyed. We cannot submit IO + * since ctx gets freed as soon as io_submit() puts its reference. The + * check here is reliable: io_destroy() sets ctx->dead before waiting + * for outstanding IO and the barrier between these two is realized by + * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we + * increment ctx->reqs_active before checking for ctx->dead and the + * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we + * don't see ctx->dead set here, io_destroy() waits for our IO to + * finish. + */ + if (ctx->dead) { + spin_unlock_irq(&ctx->ctx_lock); + ret = -EINVAL; + goto out_put_req; + } aio_run_iocb(req); if (!list_empty(&ctx->run_list)) { /* drain the run list */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 333a7bb4cb9..88928701959 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } @@ -922,14 +928,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * flush_disk - invalidates all buffer-cache entries on a disk * * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes * * Invalidates all buffer-cache entries on a disk. It should be called * when a disk has been changed -- either by a media change or online * resize. */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty) { - if (__invalidate_device(bdev)) { + if (__invalidate_device(bdev, kill_dirty)) { char name[BDEVNAME_SIZE] = ""; if (bdev->bd_disk) @@ -966,7 +973,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1019,7 +1026,7 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev); + flush_disk(bdev, true); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1215,12 +1222,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) res = __blkdev_get(bdev, mode, 0); - /* __blkdev_get() may alter read only status, check it afterwards */ - if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - __blkdev_put(bdev, mode, 0); - res = -EACCES; - } - if (whole) { /* finish claiming */ mutex_lock(&bdev->bd_mutex); @@ -1298,6 +1299,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, if (err) return ERR_PTR(err); + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } + return bdev; } EXPORT_SYMBOL(blkdev_get_by_path); @@ -1601,7 +1607,7 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; @@ -1614,7 +1620,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 15b5ca2a260..9c949348510 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (!IS_POSIXACL(inode)) + return NULL; + acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) return acl; @@ -84,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int ret = 0; + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f745287fbf2..4d2110eafe2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret; + int ret = -ENOMEM; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; - cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; for (page_index = 0; page_index < nr_pages; page_index++) { cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!cb->compressed_pages[page_index]) + goto fail2; } cb->nr_pages = nr_pages; @@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; + +fail2: + for (page_index = 0; page_index < nr_pages; page_index++) + free_page((unsigned long)cb->compressed_pages[page_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; } static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; @@ -900,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __exit btrfs_exit_compress(void) +void btrfs_exit_compress(void) { free_workspaces(); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c98b3af605..7f78cc78fdd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,6 +729,15 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for @@ -1254,6 +1263,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b531c36455d..e1aa8d607bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); goto out; - if (!page->private) + } + if (!page->private) { + WARN_ON(1); goto out; + } len = page->private >> 2; WARN_ON(len == 0); @@ -1550,6 +1554,7 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->new_trans_lock); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -2453,10 +2458,14 @@ int btrfs_commit_super(struct btrfs_root *root) up_write(&root->fs_info->cleanup_work_sem); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2554,6 +2563,8 @@ int close_ctree(struct btrfs_root *root) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info); + return 0; } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 9786963b07e..b4ffad859ad 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; @@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child) int ret; path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b55269340ce..7b3089b5c2d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -320,11 +320,6 @@ static int caching_kthread(void *data) if (!path) return -ENOMEM; - exclude_super_stripes(extent_root, block_group); - spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_readonly += block_group->bytes_super; - spin_unlock(&block_group->space_info->lock); - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* @@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } spin_unlock(&cache->lock); - if (ret == 1) + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); return 0; + } } if (load_cache_only) @@ -3344,21 +3341,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 reserved; u64 max_reclaim; u64 reclaimed = 0; - int pause = 1; + long time_left; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; max_reclaim = min(reserved, to_reclaim); - while (1) { + while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, @@ -3371,17 +3371,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + time_left = schedule_timeout_interruptible(1); + + /* We were interrupted, exit */ + if (time_left) + break; + + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3588,10 +3602,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes > 0) { if (dest) { - block_rsv_add_bytes(dest, num_bytes, 0); - } else { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3824,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -3985,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = 0; } spin_unlock(&BTRFS_I(inode)->accounting_lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) @@ -4012,6 +4039,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); @@ -4112,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4163,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4213,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4691,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out; @@ -5355,7 +5387,7 @@ again: num_bytes, data, 1); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -5633,6 +5665,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize) { struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; block_rsv = get_block_rsv(trans, root); @@ -5640,14 +5673,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->size == 0) { ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); - if (ret) + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { return ERR_PTR(ret); + } return block_rsv; } ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } return ERR_PTR(-ENOSPC); } @@ -6221,6 +6279,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, BUG_ON(!wc); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) trans->block_rsv = block_rsv; @@ -6318,6 +6378,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); if (block_rsv) trans->block_rsv = block_rsv; } @@ -6446,6 +6507,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start, int ret = 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; mutex_lock(&inode->i_mutex); first_index = start >> PAGE_CACHE_SHIFT; @@ -6531,7 +6594,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, u64 end = start + extent_key->offset - 1; em = alloc_extent_map(GFP_NOFS); - BUG_ON(!em || IS_ERR(em)); + BUG_ON(!em); em->start = start; em->len = extent_key->offset; @@ -7477,7 +7540,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, reloc_root); @@ -7535,7 +7598,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) if (found) { trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); } @@ -7779,7 +7842,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, trans = btrfs_start_transaction(extent_root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); if (extent_key->objectid == 0) { ret = del_extent_zero(trans, extent_root, path, extent_key); @@ -8013,6 +8076,13 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +} + /* * helper to account the unused space of all the readonly block group in the * list. takes mirrors into account. @@ -8270,6 +8340,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + btrfs_remove_free_space_cache(block_group); btrfs_put_block_group(block_group); @@ -8385,6 +8462,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->sectorsize = root->sectorsize; /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + exclude_super_stripes(root, cache); + + /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all @@ -8392,12 +8476,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2e993cf1766..714adc4ac4c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -1865,7 +1871,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else submit_bio(rw, bio); @@ -1920,6 +1926,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, nr = bio_get_nr_vecs(bdev); bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -1944,6 +1952,7 @@ void set_page_extent_mapped(struct page *page) static void set_page_extent_head(struct page *page, unsigned long len) { + WARN_ON(!PagePrivate(page)); set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } @@ -2126,7 +2135,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, 0, bio_flags); return ret; } @@ -2819,9 +2828,17 @@ int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - clear_extent_bit(tree, start, end, + ret = clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; } return ret; } @@ -2901,6 +2918,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2910,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2929,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2942,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2962,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; - if (off >= max) - end = 1; + u64 offset_in_extent; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; - em_start = em->start; - em_len = em->len; + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -2988,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); @@ -3192,7 +3284,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, } if (!PageUptodate(p)) uptodate = 0; - unlock_page(p); + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -3216,9 +3314,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); return eb; free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + if (!atomic_dec_and_test(&eb->refs)) return exists; btrfs_release_extent_buffer(eb); @@ -3269,10 +3384,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, continue; lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); if (i == 0) set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); @@ -3462,6 +3578,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + if (inc_all_pages) page_cache_get(page); if (!PageUptodate(page)) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7083cfafd06..9318dfefd59 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -191,7 +191,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b0e1fce1253..2b6c12e983b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; em = kmem_cache_alloc(extent_map_cache, mask); - if (!em || IS_ERR(em)) - return em; + if (!em) + return NULL; em->in_tree = 0; em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a562a250ae7..4f19a3e1bf3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, root = root->fs_info->csum_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -548,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (path->slots[0] == 0) goto out; path->slots[0]--; + } else if (ret < 0) { + goto out; } + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c800d58f301..f447b783bb8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; @@ -186,6 +199,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(GFP_NOFS); if (!split2) split2 = alloc_extent_map(GFP_NOFS); + BUG_ON(!split || !split2); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -762,6 +776,27 @@ out: } /* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be * modified. @@ -776,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -793,11 +829,24 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { + faili = i - 1; err = -ENOMEM; - BUG_ON(1); + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -837,6 +886,14 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } static ssize_t btrfs_file_aio_write(struct kiocb *iocb, @@ -846,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; struct iov_iter i; loff_t *ppos = &iocb->ki_pos; @@ -867,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - pinned[0] = NULL; - pinned[1] = NULL; - start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -946,6 +999,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } /* generic_write_checks can change our pos */ start_pos = pos; @@ -953,39 +1010,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } - while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); @@ -1015,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1060,10 +1103,6 @@ out: err = ret; kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); *ppos = pos; /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 60d68426695..a0390657451 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, return entry; } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) +static inline void +__unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); block_group->free_extents--; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + __unlink_free_space(block_group, info); block_group->free_space -= info->bytes; } @@ -1016,14 +1023,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; + u64 size = block_group->key.offset; /* * The goal is to keep the total amount of memory used per 1gb of space * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - max_bytes = MAX_CACHE_BYTES_PER_GIG * - (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1171,6 +1182,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, recalculate_thresholds(block_group); } +static void free_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); +} + static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) @@ -1195,6 +1216,7 @@ again: */ search_start = *offset; search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(block_group, bitmap_info, &search_start, &search_bytes); BUG_ON(ret < 0 || search_start != *offset); @@ -1211,13 +1233,8 @@ again: if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); /* * no entry after this bitmap, but we still have bytes to @@ -1250,13 +1267,8 @@ again: return -EAGAIN; goto again; - } else if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + } else if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); return 0; } @@ -1359,22 +1371,14 @@ out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +bool try_merge_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *right_info = NULL; - struct btrfs_free_space *left_info = NULL; - struct btrfs_free_space *info = NULL; - int ret = 0; - - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - spin_lock(&block_group->tree_lock); + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; /* * first we want to see if there is free space adjacent to the range we @@ -1388,37 +1392,62 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, else left_info = tree_search_offset(block_group, offset - 1, 0, 0); - /* - * If there was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - if ((!left_info || left_info->bitmap) && - (!right_info || right_info->bitmap)) { - ret = insert_into_bitmap(block_group, info); - - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } - } - if (right_info && !right_info->bitmap) { - unlink_free_space(block_group, right_info); + if (update_stat) + unlink_free_space(block_group, right_info); + else + __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); + merged = true; } if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset) { - unlink_free_space(block_group, left_info); + if (update_stat) + unlink_free_space(block_group, left_info); + else + __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; kfree(left_info); + merged = true; } + return merged; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + + if (try_merge_free_space(block_group, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(block_group, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: ret = link_free_space(block_group, info); if (ret) kfree(info); @@ -1621,6 +1650,7 @@ __btrfs_return_cluster_to_free_space( node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); BUG_ON(entry->bitmap); + try_merge_free_space(block_group, entry, false); tree_insert_offset(&block_group->free_space_offset, entry->offset, &entry->offset_index, 0); } @@ -1685,13 +1715,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(block_group, entry, offset, bytes); - if (!entry->bytes) { - unlink_free_space(block_group, entry); - kfree(entry->bitmap); - kfree(entry); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!entry->bytes) + free_bitmap(block_group, entry); } else { unlink_free_space(block_group, entry); entry->offset += bytes; @@ -1789,6 +1814,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, ret = search_start; bitmap_clear_bits(block_group, entry, ret, bytes); + if (entry->bytes == 0) + free_bitmap(block_group, entry); out: spin_unlock(&cluster->lock); spin_unlock(&block_group->tree_lock); @@ -1842,15 +1869,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry->offset += bytes; entry->bytes -= bytes; - if (entry->bytes == 0) { + if (entry->bytes == 0) rb_erase(&entry->offset_index, &cluster->root); - kfree(entry); - } break; } out: spin_unlock(&cluster->lock); + if (!ret) + return 0; + + spin_lock(&block_group->tree_lock); + + block_group->free_space -= bytes; + if (entry->bytes == 0) { + block_group->free_extents--; + kfree(entry); + } + + spin_unlock(&block_group->tree_lock); + return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 160b55b3e13..512c3d1da08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode, unsigned long *nr_written, int unlock); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(trans, inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err; } @@ -416,7 +417,7 @@ again: } if (start == 0) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -612,6 +613,7 @@ retry: GFP_NOFS); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -643,6 +645,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -771,7 +774,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -819,6 +822,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(ret); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -1049,7 +1053,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, } else { trans = btrfs_join_transaction(root, 1); } - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); cow_start = (u64)-1; cur_offset = start; @@ -1168,6 +1172,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1557,6 +1562,7 @@ out: out_page: unlock_page(page); page_cache_release(page); + kfree(fixup); } /* @@ -1703,7 +1709,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1720,6 +1726,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1907,7 +1914,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private = 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -2354,6 +2361,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2381,6 +2389,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_end_transaction(trans, root); } @@ -2641,7 +2650,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto err; + goto out; } path->leave_spinning = 1; @@ -2714,9 +2723,10 @@ static int check_path_shared(struct btrfs_root *root, struct extent_buffer *eb; int level; u64 refs = 1; - int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + if (!path->nodes[level]) break; eb = path->nodes[level]; @@ -2727,7 +2737,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return ret; /* XXX callers? */ + return 0; } /* @@ -4134,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); - if (root != sub_root) { + if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) btrfs_orphan_cleanup(sub_root); @@ -4347,6 +4357,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); @@ -4372,6 +4384,7 @@ void btrfs_dirty_inode(struct inode *inode) return; trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); @@ -4692,7 +4705,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4753,7 +4766,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4794,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; @@ -4809,10 +4819,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; @@ -4881,7 +4892,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_fail; @@ -5176,6 +5187,8 @@ again: em = NULL; btrfs_release_path(root, path); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return ERR_CAST(trans); goto again; } map = kmap(page); @@ -5266,6 +5279,128 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -5280,8 +5415,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); trans = btrfs_join_transaction(root, 0); - if (!trans) - return ERR_PTR(-ENOMEM); + if (IS_ERR(trans)) + return ERR_CAST(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -5505,7 +5640,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * while we look for nocow cross refs */ trans = btrfs_join_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { @@ -5640,7 +5775,7 @@ again: BUG_ON(!ordered); trans = btrfs_join_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { err = -ENOMEM; goto out; } @@ -5920,6 +6055,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (!skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } @@ -6088,7 +6224,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) @@ -6968,7 +7104,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a506a22b522..5fdb2abc4fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -1067,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) return -EINVAL; if (flags & ~BTRFS_SUBVOL_RDONLY) return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1093,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } - ret = btrfs_update_root(trans, root, + ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); btrfs_commit_transaction(trans, root); @@ -1898,7 +1905,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = inode->i_ino; - new_key.offset = key.offset + destoff - off; + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -2082,7 +2092,7 @@ static long btrfs_ioctl_trans_start(struct file *file) ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto out_drop; file->private_data = trans; @@ -2138,9 +2148,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { btrfs_free_path(path); - return -ENOMEM; + return PTR_ERR(trans); } dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); @@ -2201,7 +2211,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) int num_types = 4; int alloc_size; int ret = 0; - int slot_count = 0; + u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, @@ -2240,7 +2250,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) goto out; } - slot_count = min_t(int, space_args.space_slots, slot_count); + slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; @@ -2260,6 +2270,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; + if (!slot_count) + break; + info = NULL; rcu_read_lock(); list_for_each_entry_rcu(tmp, &root->fs_info->space_info, @@ -2281,7 +2294,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; + slot_count--; } + if (!slot_count) + break; } up_read(&info->groups_sem); } @@ -2334,6 +2350,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp u64 transid; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); transid = trans->transid; btrfs_commit_transaction_async(trans, root, 0); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cc9b450399d..a178f5ebea7 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws, unsigned long tot_out; unsigned long tot_len; char *buf; + bool may_late_unmap, need_unmap; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); @@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in += in_len; working_bytes = in_len; + may_late_unmap = need_unmap = false; /* fast path: avoid using the working buffer */ if (in_page_bytes_left >= in_len) { buf = data_in + in_offset; bytes = in_len; + may_late_unmap = true; goto cont; } @@ -329,14 +332,17 @@ cont: if (working_bytes == 0 && tot_in >= tot_len) break; - kunmap(pages_in[page_in_index]); - page_in_index++; - if (page_in_index >= total_pages_in) { + if (page_in_index + 1 >= total_pages_in) { ret = -1; - data_in = NULL; goto done; } - data_in = kmap(pages_in[page_in_index]); + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); in_page_bytes_left = PAGE_CACHE_SIZE; in_offset = 0; @@ -346,6 +352,8 @@ cont: out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "btrfs decompress failed\n"); ret = -1; @@ -363,8 +371,7 @@ cont: break; } done: - if (data_in) - kunmap(pages_in[page_in_index]); + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2b61e1ddcd9..083a5547737 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { struct rb_root *root = &tree->tree; - struct rb_node *prev; + struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0d126be22b6..fb2605d998e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) #else BUG(); #endif + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 045c9c2b2d7..31ade5802ae 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->bytenr = dest->node->start; new_node->level = node->level; new_node->lowest = node->lowest; + new_node->checked = 1; new_node->root = dest; if (!node->lowest) { @@ -2028,6 +2029,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, @@ -2147,6 +2149,12 @@ again: } trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } if (!err) { if (num_bytes != rc->merging_rsv_size) { @@ -3222,6 +3230,7 @@ truncate: trans = btrfs_join_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); + ret = PTR_ERR(trans); goto out; } @@ -3628,6 +3637,7 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3644,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3656,8 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - + BUG_ON(IS_ERR(trans)); +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3770,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, @@ -3804,7 +3826,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4022,6 +4047,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) int ret; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4125,6 +4151,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } rc->merge_reloc_tree = 1; @@ -4154,9 +4185,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: kfree(rc); +out: while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b2130c46fdb..d39a9895d93 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -155,7 +155,8 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_err, }; static match_table_t tokens = { @@ -184,6 +185,7 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, {Opt_err, NULL}, }; @@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -383,7 +388,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *p; + char *opts, *orig, *p; int error = 0; int intarg; @@ -397,6 +402,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; + orig = opts; while ((p = strsep(&opts, ",")) != NULL) { int token; @@ -432,7 +438,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } out_free_opts: - kfree(opts); + kfree(orig); out: /* * If no subvolume name is specified we use the default one. Allocate @@ -623,6 +629,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); return ret; } @@ -761,6 +769,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); } else { char b[BDEVNAME_SIZE]; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bae5c7b8bbe..3d73c8d93bb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root, 0); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } /* take transaction reference */ mutex_lock(&root->fs_info->trans_mutex); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 054744ac571..a4bbb854dfd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, } dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(root, path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } read_extent_buffer(eb, src_copy, src_ptr, item_size); @@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(root, path); @@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log, int match = 0; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; @@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset = (u64)-1; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); @@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; if (*level == 1) { wc->process_func(root, next, wc, ptr_gen); @@ -2032,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2096,7 +2116,7 @@ out: smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); - return 0; + return ret; } static void free_log_tree(struct btrfs_trans_handle *trans, @@ -2194,6 +2214,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2594,6 +2617,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -2725,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } min_key.objectid = inode->i_ino; min_key.type = BTRFS_INODE_ITEM_KEY; @@ -3080,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) BUG_ON(!path); trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d158530233b..dd13eb81ee4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1213,6 +1213,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, return -ENOMEM; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1334,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1412,6 +1416,13 @@ out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* @@ -1601,11 +1612,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = find_next_devid(root, &device->devid); if (ret) { + kfree(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + lock_chunks(root); device->writeable = 1; @@ -1621,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { @@ -1873,7 +1892,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); lock_chunks(root); @@ -2047,7 +2066,7 @@ int btrfs_balance(struct btrfs_root *dev_root) BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_grow_device(trans, device, old_size); BUG_ON(ret); @@ -2213,6 +2232,11 @@ again: /* Shrinking succeeded, else we would be at "done". */ trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + lock_chunks(root); device->disk_total_bytes = new_size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a5776531dc2..d779cefcfd7 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -370,7 +370,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; @@ -378,7 +379,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, char *suffix; char *name; - err = security_inode_init_security(inode, dir, &suffix, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 7a43fd640bb..b3cc8039134 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); + struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif /* __XATTR__ */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 42c7fafc8bf..a0358c2189c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, bool preemptive) { struct dentry *grave, *trap; + struct path path, path_to_graveyard; char nbuffer[8 + 8 + 1]; int ret; @@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* non-directories can just be unlinked */ if (!S_ISDIR(rep->d_inode->i_mode)) { _debug("unlink stale object"); - ret = vfs_unlink(dir->d_inode, rep); - if (preemptive) - cachefiles_mark_object_buried(cache, rep); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_unlink(&path, rep); + if (ret < 0) { + cachefiles_io_error(cache, "Unlink security error"); + } else { + ret = vfs_unlink(dir->d_inode, rep); + + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } mutex_unlock(&dir->d_inode->i_mutex); @@ -379,12 +388,23 @@ try_again: } /* attempt the rename */ - ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); - if (ret != 0 && ret != -ENOMEM) - cachefiles_io_error(cache, "Rename failed with error %d", ret); + path.mnt = cache->mnt; + path.dentry = dir; + path_to_graveyard.mnt = cache->mnt; + path_to_graveyard.dentry = cache->graveyard; + ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + if (ret < 0) { + cachefiles_io_error(cache, "Rename security error %d", ret); + } else { + ret = vfs_rename(dir->d_inode, rep, + cache->graveyard->d_inode, grave); + if (ret != 0 && ret != -ENOMEM) + cachefiles_io_error(cache, + "Rename failed with error %d", ret); - if (preemptive) - cachefiles_mark_object_buried(cache, rep); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + } unlock_rename(cache->graveyard, dir); dput(grave); @@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, { struct cachefiles_cache *cache; struct dentry *dir, *next = NULL; + struct path path; unsigned long start; const char *name; int ret, nlen; @@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, cache = container_of(parent->fscache.cache, struct cachefiles_cache, cache); + path.mnt = cache->mnt; ASSERT(parent->dentry); ASSERT(parent->dentry->d_inode); @@ -511,6 +533,10 @@ lookup_again: if (ret < 0) goto create_error; + path.dentry = dir; + ret = security_path_mkdir(&path, next, 0); + if (ret < 0) + goto create_error; start = jiffies; ret = vfs_mkdir(dir->d_inode, next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); @@ -536,6 +562,10 @@ lookup_again: if (ret < 0) goto create_error; + path.dentry = dir; + ret = security_path_mknod(&path, next, S_IFREG, 0); + if (ret < 0) + goto create_error; start = jiffies; ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); cachefiles_hist(cachefiles_create_histogram, start); @@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, { struct dentry *subdir; unsigned long start; + struct path path; int ret; _enter(",,%s", dirname); @@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _debug("attempt mkdir"); + path.mnt = cache->mnt; + path.dentry = dir; + ret = security_path_mkdir(&path, subdir, 0700); + if (ret < 0) + goto mkdir_error; ret = vfs_mkdir(dir->d_inode, subdir, 0700); if (ret < 0) goto mkdir_error; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0bc68de8edd..ebafa65a29b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -409,7 +409,7 @@ more: spin_lock(&inode->i_lock); if (ci->i_release_count == fi->dir_release_count) { dout(" marking %p complete\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -496,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, /* .snap dir? */ if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); @@ -992,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; dir = dentry->d_parent->d_inode; @@ -1029,28 +1030,8 @@ out_touch: static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *parent_inode = NULL; - u64 snapid = CEPH_NOSNAP; - if (!IS_ROOT(dentry)) { - parent_inode = dentry->d_parent->d_inode; - if (parent_inode) - snapid = ceph_snap(parent_inode); - } - dout("dentry_release %p parent %p\n", dentry, parent_inode); - if (parent_inode && snapid != CEPH_SNAPDIR) { - struct ceph_inode_info *ci = ceph_inode(parent_inode); - - spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen || - snapid <= CEPH_MAXSNAP) { - dout(" clearing %p complete (d_release)\n", - parent_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } - spin_unlock(&parent_inode->i_lock); - } + dout("dentry_release %p\n", dentry); if (di) { ceph_dentry_lru_del(dentry); if (di->lease_session) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5625463aa47..193bfa5e9cb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode, (issued & CEPH_CAP_FILE_EXCL) == 0 && (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ ci->i_max_offset = 2; } break; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 39c243acd06..f40b9139e43 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (lastinode) iput(lastinode); - dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); - list_for_each_entry(child, &realm->children, child_item) - queue_realm_cap_snaps(child); + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + list_del_init(&realm->dirty_item); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -683,7 +687,9 @@ more: * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. */ - list_for_each_entry(realm, &dirty_realms, dirty_item) { + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); queue_realm_cap_snaps(realm); } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index ee45648b0d1..7cb0f7f847e 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -3,6 +3,7 @@ config CIFS depends on INET select NLS select CRYPTO + select CRYPTO_MD4 select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index f1c68629f27..0a265ad9e42 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) cFYI(1, "in %s", __func__); BUG_ON(IS_ROOT(mntpt)); - xid = GetXid(); - /* * The MSDFS spec states that paths in DFS referral requests and * responses must be prefixed by a single '\' character instead of @@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(-ENOMEM); full_path = build_path_from_dentry(mntpt); if (full_path == NULL) - goto free_xid; + goto cdda_exit; cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) } ses = tlink_tcon(tlink)->ses; + xid = GetXid(); rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, &num_referrals, &referrals, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + FreeXid(xid); cifs_put_tlink(tlink); @@ -338,8 +338,7 @@ success: free_dfs_info_array(referrals, num_referrals); free_full_path: kfree(full_path); -free_xid: - FreeXid(xid); +cdda_exit: cFYI(1, "leaving %s" , __func__); return mnt; } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1e7636b145a..beeebf19423 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); + if (!ppace) { + cERROR(1, "DACL memory allocation error"); + return; + } for (i = 0; i < num_aces; ++i) { ppace[i] = (struct cifs_ace *) (acl_base + acl_size); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 0db5f1de022..a51585f9852 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -657,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses) get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); - if (!tfm_arc4 || IS_ERR(tfm_arc4)) { + if (IS_ERR(tfm_arc4)) { + rc = PTR_ERR(tfm_arc4); cERROR(1, "could not allocate crypto API arc4\n"); - return PTR_ERR(tfm_arc4); + return rc; } desc.tfm = tfm_arc4; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 14789a97304..a9371b6578c 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.69" +#define CIFS_VERSION "1.71" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index edd5b29b53c..17afb0fbcae 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -188,6 +188,8 @@ struct TCP_Server_Info { /* multiplexed reads or writes */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ + /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */ + /* when socket is setup (and during reconnect) before NegProt sent */ unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ @@ -652,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define MID_REQUEST_SUBMITTED 2 #define MID_RESPONSE_RECEIVED 4 #define MID_RETRY_NEEDED 8 /* session closed while this request out */ -#define MID_NO_RESP_NEEDED 0x10 +#define MID_RESPONSE_MALFORMED 0x10 /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3106f5e5c63..904aa47e351 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) } } - if (ses->status == CifsExiting) - return -EIO; - /* * Give demultiplex thread up to 10 seconds to reconnect, should be * greater than cifs socket timeout which is 7 seconds @@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) * retrying until process is killed or server comes * back on-line */ - if (!tcon->retry || ses->status == CifsExiting) { + if (!tcon->retry) { cFYI(1, "gave up waiting on reconnect in smb_init"); return -EHOSTDOWN; } @@ -4914,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, __u16 fid, __u32 pid_of_opener, bool SetAllocation) { struct smb_com_transaction2_sfi_req *pSMB = NULL; - char *data_offset; struct file_end_of_file_info *parm_data; int rc = 0; __u16 params, param_offset, offset, byte_count, count; @@ -4938,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; offset = param_offset + params; - data_offset = (char *) (&pSMB->hdr.Protocol) + offset; - count = sizeof(struct file_end_of_file_info); pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find exact max SMB PDU from sess structure BB */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 47d8ff62368..8d6c17ab593 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -337,8 +337,13 @@ cifs_echo_request(struct work_struct *work) struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - /* no need to ping if we got a response recently */ - if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + /* + * We cannot send an echo until the NEGOTIATE_PROTOCOL request is + * done, which is indicated by maxBuf != 0. Also, no need to ping if + * we got a response recently + */ + if (server->maxBuf == 0 || + time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) goto requeue_echo; rc = CIFSSMBEcho(server); @@ -578,14 +583,23 @@ incomplete_rcv: else if (reconnect == 1) continue; - length += 4; /* account for rfc1002 hdr */ + total_read += 4; /* account for rfc1002 hdr */ + dump_smb(smb_buffer, total_read); - dump_smb(smb_buffer, length); - if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { - cifs_dump_mem("Bad SMB: ", smb_buffer, 48); - continue; - } + /* + * We know that we received enough to get to the MID as we + * checked the pdu_length earlier. Now check to see + * if the rest of the header is OK. We borrow the length + * var for the rest of the loop to avoid a new stack var. + * + * 48 bytes is enough to display the header and a little bit + * into the payload for debugging purposes. + */ + length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); + if (length != 0) + cifs_dump_mem("Bad SMB: ", smb_buffer, + min_t(unsigned int, total_read, 48)); mid_entry = NULL; server->lstrp = jiffies; @@ -597,7 +611,8 @@ incomplete_rcv: if ((mid_entry->mid == smb_buffer->Mid) && (mid_entry->midState == MID_REQUEST_SUBMITTED) && (mid_entry->command == smb_buffer->Command)) { - if (check2ndT2(smb_buffer,server->maxBuf) > 0) { + if (length == 0 && + check2ndT2(smb_buffer, server->maxBuf) > 0) { /* We have a multipart transact2 resp */ isMultiRsp = true; if (mid_entry->resp_buf) { @@ -632,12 +647,17 @@ incomplete_rcv: mid_entry->resp_buf = smb_buffer; mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: - mid_entry->midState = MID_RESPONSE_RECEIVED; - list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); + if (length == 0) + mid_entry->midState = + MID_RESPONSE_RECEIVED; + else + mid_entry->midState = + MID_RESPONSE_MALFORMED; #ifdef CONFIG_CIFS_STATS2 mid_entry->when_received = jiffies; #endif + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); break; } mid_entry = NULL; @@ -653,6 +673,9 @@ multi_t2_fnd: else smallbuf = NULL; } + } else if (length != 0) { + /* response sanity checks failed */ + continue; } else if (!is_valid_oplock_break(smb_buffer, server) && !isMultiRsp) { cERROR(1, "No task to wake, unknown frame received! " diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0de17c1db60..e964b1cd5dd 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsTconInfo *tcon; struct tcon_link *tlink; struct cifsFileInfo *pCifsFile = NULL; - struct cifsInodeInfo *pCifsInode; char *full_path = NULL; bool posix_open_ok = false; __u16 netfid; @@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file) } tcon = tlink_tcon(tlink); - pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { rc = -ENOMEM; @@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) char *write_data; int rc = -EFAULT; int bytes_written = 0; - struct cifs_sb_info *cifs_sb; struct inode *inode; struct cifsFileInfo *open_file; @@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return -EFAULT; inode = page->mapping->host; - cifs_sb = CIFS_SB(inode->i_sb); offset += (loff_t)from; write_data = kmap(page); @@ -1667,9 +1662,10 @@ static ssize_t cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { - size_t total_written = 0, written = 0; - unsigned long num_pages, npages; - size_t copied, len, cur_len, i; + unsigned int written; + unsigned long num_pages, npages, i; + size_t copied, len, cur_len; + ssize_t total_written = 0; struct kvec *to_send; struct page **pages; struct iov_iter it; @@ -1825,7 +1821,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, { int rc; int xid; - unsigned int total_read, bytes_read = 0; + ssize_t total_read; + unsigned int bytes_read = 0; size_t len, cur_len; int iov_offset = 0; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 02cd60aefbf..e8804d37340 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(md5)) { + rc = PTR_ERR(md5); cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); - return PTR_ERR(md5); + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); sdescmd5 = kmalloc(size, GFP_KERNEL); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index a09e077ba92..2a930a752a7 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server) { __u16 mid = 0; __u16 last_mid; - int collision; - - if (server == NULL) - return mid; + bool collision; spin_lock(&GlobalMid_Lock); last_mid = server->CurrentMid; /* we do not want to loop forever */ @@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server) (and it would also have to have been a request that did not time out) */ while (server->CurrentMid != last_mid) { - struct list_head *tmp; struct mid_q_entry *mid_entry; + unsigned int num_mids; - collision = 0; + collision = false; if (server->CurrentMid == 0) server->CurrentMid++; - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - - if ((mid_entry->mid == server->CurrentMid) && - (mid_entry->midState == MID_REQUEST_SUBMITTED)) { + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == server->CurrentMid && + mid_entry->midState == MID_REQUEST_SUBMITTED) { /* This mid is in use, try a different one */ - collision = 1; + collision = true; break; } } - if (collision == 0) { + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { mid = server->CurrentMid; break; } @@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , } static int -checkSMBhdr(struct smb_hdr *smb, __u16 mid) +check_smb_hdr(struct smb_hdr *smb, __u16 mid) { - /* Make sure that this really is an SMB, that it is a response, - and that the message ids match */ - if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && - (mid == smb->Mid)) { - if (smb->Flags & SMBFLG_RESPONSE) - return 0; - else { - /* only one valid case where server sends us request */ - if (smb->Command == SMB_COM_LOCKING_ANDX) - return 0; - else - cERROR(1, "Received Request not response"); - } - } else { /* bad signature or mid */ - if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) - cERROR(1, "Bad protocol string signature header %x", - *(unsigned int *) smb->Protocol); - if (mid != smb->Mid) - cERROR(1, "Mids do not match"); + /* does it have the right SMB "signature" ? */ + if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { + cERROR(1, "Bad protocol string signature header 0x%x", + *(unsigned int *)smb->Protocol); + return 1; + } + + /* Make sure that message ids match */ + if (mid != smb->Mid) { + cERROR(1, "Mids do not match. received=%u expected=%u", + smb->Mid, mid); + return 1; } - cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); + + /* if it's a response then accept */ + if (smb->Flags & SMBFLG_RESPONSE) + return 0; + + /* only one valid case where server sends us request */ + if (smb->Command == SMB_COM_LOCKING_ANDX) + return 0; + + cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); return 1; } @@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) return 1; } - if (checkSMBhdr(smb, mid)) + if (check_smb_hdr(smb, mid)) return 1; clc_len = smbCalcSize_LE(smb); @@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } - cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", + cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", clc_len, 4 + len, smb->Mid); - /* Windows XP can return a few bytes too much, presumably - an illegal pad, at the end of byte range lock responses - so we allow for that three byte pad, as long as actual - received length is as long or longer than calculated length */ - /* We have now had to extend this more, since there is a - case in which it needs to be bigger still to handle a - malformed response to transact2 findfirst from WinXP when - access denied is returned and thus bcc and wct are zero - but server says length is 0x21 bytes too long as if the server - forget to reset the smb rfc1001 length when it reset the - wct and bcc to minimum size and drop the t2 parms and data */ - if ((4+len > clc_len) && (len <= clc_len + 512)) - return 0; - else { - cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d", + + if (4 + len < clc_len) { + cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", len, smb->Mid); return 1; + } else if (len > clc_len + 512) { + /* + * Some servers (Windows XP in particular) send more + * data than the lengths in the SMB packet would + * indicate on certain calls (byte range locks and + * trans2 find first calls in particular). While the + * client can handle such a frame by ignoring the + * trailing data, we choose limit the amount of extra + * data to 512 bytes. + */ + cERROR(1, "RFC1001 size %u more than 512 bytes larger " + "than SMB for mid=%u", len, smb->Mid); + return 1; } } return 0; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 8d9189f6447..79f641eeda3 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) { int rc, alen, slen; const char *pct; - char *endp, scope_id[13]; + char scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; @@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) memcpy(scope_id, pct + 1, slen); scope_id[slen] = '\0'; - s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); - if (endp != scope_id + slen) - return 0; + rc = strict_strtoul(scope_id, 0, + (unsigned long *)&s6->sin6_scope_id); + rc = (rc == 0) ? 1 : 0; } return rc; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 7f25cc3d225..f8e4cd2a791 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) { int rc = 0; int xid, i; - struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; struct cifsFileInfo *cifsFile = NULL; char *current_entry; @@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) xid = GetXid(); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 1adc9625a34..16765703131 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate: if (type == LANMAN) { #ifdef CONFIG_CIFS_WEAK_PW_HASH - char lnm_session_key[CIFS_SESS_KEY_SIZE]; + char lnm_session_key[CIFS_AUTH_RESP_SIZE]; pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; /* no capabilities flags in old lanman negotiation */ - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* Calculate hash with password and copy into bcc_ptr. * Encryption Key (stored as in cryptkey) gets used if the @@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate: true : false, lnm_session_key); ses->flags |= CIFS_SES_LANMAN; - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; /* can not sign if LANMAN negotiated so no need to calculate signing key? but what if server diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index b5450e9f40c..b5041c84998 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) md4 = crypto_alloc_shash("md4", 0, 0); if (IS_ERR(md4)) { + rc = PTR_ERR(md4); cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); - return PTR_ERR(md4); + return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); sdescmd4 = kmalloc(size, GFP_KERNEL); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c1ccca1a933..46d8756f2b2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) server->tcpStatus = CifsNeedReconnect; } - if (rc < 0) { + if (rc < 0 && rc != -EINTR) cERROR(1, "Error %d sending data on socket to server", rc); - } else + else rc = 0; /* Don't want to modify the buffer as a @@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, if (rc) return rc; + /* enable signing if server requires it */ + if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + mutex_lock(&server->srv_mutex); mid = AllocMidQEntry(in_buf, server); if (mid == NULL) { @@ -453,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) case MID_RETRY_NEEDED: rc = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + rc = -EIO; + break; default: cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, mid->mid, mid->midState); @@ -570,17 +577,33 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, #endif mutex_unlock(&ses->server->srv_mutex); - cifs_small_buf_release(in_buf); - if (rc < 0) + if (rc < 0) { + cifs_small_buf_release(in_buf); goto out; + } - if (long_op == CIFS_ASYNC_OP) + if (long_op == CIFS_ASYNC_OP) { + cifs_small_buf_release(in_buf); goto out; + } rc = wait_for_response(ses->server, midQ); - if (rc != 0) - goto out; + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(in_buf); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } + + cifs_small_buf_release(in_buf); rc = sync_mid_result(midQ, ses->server); if (rc != 0) { @@ -724,8 +747,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, goto out; rc = wait_for_response(ses->server, midQ); - if (rc != 0) - goto out; + if (rc != 0) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + atomic_dec(&ses->server->inFlight); + wake_up(&ses->server->request_q); + return rc; + } + spin_unlock(&GlobalMid_Lock); + } rc = sync_mid_result(midQ, ses->server); if (rc != 0) { @@ -922,10 +956,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, } } - if (wait_for_response(ses->server, midQ) == 0) { - /* We got the response - restart system call. */ - rstart = 1; + rc = wait_for_response(ses->server, midQ); + if (rc) { + send_nt_cancel(ses->server, in_buf, midQ); + spin_lock(&GlobalMid_Lock); + if (midQ->midState == MID_REQUEST_SUBMITTED) { + /* no longer considered to be "in-flight" */ + midQ->callback = DeleteMidQEntry; + spin_unlock(&GlobalMid_Lock); + return rc; + } + spin_unlock(&GlobalMid_Lock); } + + /* We got the response - restart system call. */ + rstart = 1; } rc = sync_mid_result(midQ, ses->server); diff --git a/fs/compat.c b/fs/compat.c index f6fd0a00e6c..c6d31a3bab8 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - path_put(&path); - } + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) { - struct file * file; struct kstatfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - fput(file); -out: return error; } @@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct path path; + struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - path_put(&path); - } + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct file * file; struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - fput(file); -out: return error; } @@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -2308,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd, } #endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/dcache.c b/fs/dcache.c index 2a6bd9a4ae9..a39fe47c466 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(parent->d_lock) __releases(dentry->d_inode->i_lock) { - dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb) } /* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs * list is non-empty and continue searching. @@ -1066,24 +1099,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1181,24 +1200,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode) if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_alias(inode); + res = d_find_any_alias(inode); if (res) goto out_iput; @@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_lock(&inode->i_lock); - res = __d_find_alias(inode, 0); + res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(tmp); @@ -2920,28 +2947,14 @@ resume: spin_unlock(&dentry->d_lock); } if (this_parent != root) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; + struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; this_parent->d_count--; } - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9c64ae9e4c1..2d8c87b951c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1468,15 +1468,13 @@ static void work_stop(void) static int work_start(void) { - recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + recv_workqueue = create_singlethread_workqueue("dlm_recv"); if (!recv_workqueue) { log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + send_workqueue = create_singlethread_workqueue("dlm_send"); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 6fc4f319b55..534c1d46e69 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct dentry *lower_dentry; struct vfsmount *lower_mnt; - struct dentry *dentry_save; - struct vfsmount *vfsmount_save; + struct dentry *dentry_save = NULL; + struct vfsmount *vfsmount_save = NULL; int rc = 1; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + } rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + } if (dentry->d_inode) { struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index dbc84ed9633..e00753496e3 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry, u32 flags); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct inode *ecryptfs_dir_inode, - struct nameidata *ecryptfs_nd); + struct inode *ecryptfs_dir_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, struct dentry *ecryptfs_dentry, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 81e10e6a944..7d1050e254f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, + .read = generic_read_dir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index bd33f87a190..b592938a84b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, unsigned int flags_save; int rc; - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - flags_save = nd->flags; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; - nd->flags &= ~LOOKUP_OPEN; + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + flags_save = nd->flags; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; + } rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; - nd->flags = flags_save; + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + nd->flags = flags_save; + } return rc; } @@ -241,8 +245,7 @@ out: */ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct inode *ecryptfs_dir_inode, - struct nameidata *ecryptfs_nd) + struct inode *ecryptfs_dir_inode) { struct dentry *lower_dir_dentry; struct vfsmount *lower_mnt; @@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out; if (special_file(lower_inode->i_mode)) goto out; - if (!ecryptfs_nd) - goto out; /* Released in this function */ page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); if (!page_virt) { @@ -349,75 +350,6 @@ out: } /** - * ecryptfs_new_lower_dentry - * @name: The name of the new dentry. - * @lower_dir_dentry: Parent directory of the new dentry. - * @nd: nameidata from last lookup. - * - * Create a new dentry or get it from lower parent dir. - */ -static struct dentry * -ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry, - struct nameidata *nd) -{ - struct dentry *new_dentry; - struct dentry *tmp; - struct inode *lower_dir_inode; - - lower_dir_inode = lower_dir_dentry->d_inode; - - tmp = d_alloc(lower_dir_dentry, name); - if (!tmp) - return ERR_PTR(-ENOMEM); - - mutex_lock(&lower_dir_inode->i_mutex); - new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd); - mutex_unlock(&lower_dir_inode->i_mutex); - - if (!new_dentry) - new_dentry = tmp; - else - dput(tmp); - - return new_dentry; -} - - -/** - * ecryptfs_lookup_one_lower - * @ecryptfs_dentry: The eCryptfs dentry that we are looking up - * @lower_dir_dentry: lower parent directory - * @name: lower file name - * - * Get the lower dentry from vfs. If lower dentry does not exist yet, - * create it. - */ -static struct dentry * -ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, - struct dentry *lower_dir_dentry, struct qstr *name) -{ - struct nameidata nd; - struct vfsmount *lower_mnt; - int err; - - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( - ecryptfs_dentry->d_parent)); - err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); - mntput(lower_mnt); - - if (!err) { - /* we dont need the mount */ - mntput(nd.path.mnt); - return nd.path.dentry; - } - if (err != -ENOENT) - return ERR_PTR(err); - - /* create a new lower dentry */ - return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd); -} - -/** * ecryptfs_lookup * @ecryptfs_dir_inode: The eCryptfs directory inode * @ecryptfs_dentry: The eCryptfs dentry that we are looking up @@ -434,7 +366,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, size_t encrypted_and_encoded_name_size; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; struct dentry *lower_dir_dentry, *lower_dentry; - struct qstr lower_name; int rc = 0; if ((ecryptfs_dentry->d_name.len == 1 @@ -444,20 +375,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - lower_name.name = ecryptfs_dentry->d_name.name; - lower_name.len = ecryptfs_dentry->d_name.len; - lower_name.hash = ecryptfs_dentry->d_name.hash; - if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { - rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - lower_dir_dentry->d_inode, &lower_name); - if (rc < 0) - goto out_d_drop; - } - lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry, &lower_name); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); goto out_d_drop; @@ -479,28 +404,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } - lower_name.name = encrypted_and_encoded_name; - lower_name.len = encrypted_and_encoded_name_size; - lower_name.hash = full_name_hash(lower_name.name, lower_name.len); - if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { - rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, - lower_dir_dentry->d_inode, &lower_name); - if (rc < 0) - goto out_d_drop; - } - lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry, &lower_name); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); goto out_d_drop; } lookup_and_interpose: rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - ecryptfs_dir_inode, - ecryptfs_nd); + ecryptfs_dir_inode); goto out; out_d_drop: d_drop(ecryptfs_dentry); @@ -1092,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), ecryptfs_dentry_to_lower(dentry), &lower_stat); if (!rc) { + fsstack_copy_attr_all(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)); generic_fillattr(dentry->d_inode, stat); stat->blocks = lower_stat.blocks; } diff --git a/fs/eventfd.c b/fs/eventfd.c index e0194b3e14d..d9a59177391 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get); * @ctx: [in] Pointer to eventfd context. * * The eventfd context reference must have been previously acquired either - * with eventfd_ctx_get() or eventfd_ctx_fdget()). + * with eventfd_ctx_get() or eventfd_ctx_fdget(). */ void eventfd_ctx_put(struct eventfd_ctx *ctx) { @@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. * @ctx: [in] Pointer to eventfd context. * @wait: [in] Wait queue to be removed. - * @cnt: [out] Pointer to the 64bit conter value. + * @cnt: [out] Pointer to the 64-bit counter value. * - * Returns zero if successful, or the following error codes: + * Returns %0 if successful, or the following error codes: * * -EAGAIN : The operation would have blocked. * @@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. * @ctx: [in] Pointer to eventfd context. * @no_wait: [in] Different from zero if the operation should not block. - * @cnt: [out] Pointer to the 64bit conter value. + * @cnt: [out] Pointer to the 64-bit counter value. * - * Returns zero if successful, or the following error codes: + * Returns %0 if successful, or the following error codes: * - * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -EAGAIN : The operation would have blocked but @no_wait was non-zero. * -ERESTARTSYS : A signal interrupted the wait operation. * * If @no_wait is zero, the function might sleep until the eventfd internal diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cc8a9b7d606..4a09af9e9a6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -63,6 +63,13 @@ * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -224,6 +231,9 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + /* Used for safe wake up implementation */ static struct nested_calls poll_safewake_ncalls; @@ -1114,6 +1124,17 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed); } +static inline struct timespec ep_set_mstimeout(long ms) +{ + struct timespec now, ts = { + .tv_sec = ms / MSEC_PER_SEC, + .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), + }; + + ktime_get_ts(&now); + return timespec_add_safe(now, ts); +} + static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { @@ -1121,12 +1142,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, unsigned long flags; long slack; wait_queue_t wait; - struct timespec end_time; ktime_t expires, *to = NULL; if (timeout > 0) { - ktime_get_ts(&end_time); - timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); + struct timespec end_time = ep_set_mstimeout(timeout); + slack = select_estimate_accuracy(&end_time); to = &expires; *to = timespec_to_ktime(end_time); @@ -1188,6 +1208,62 @@ retry: return res; } +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + epi->ffd.file->private_data, current); + if (error != 0) + break; + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); +} + /* * Open an eventpoll file descriptor. */ @@ -1236,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; + int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; @@ -1277,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ ep = file->private_data; + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. + * + * We hold epmutex across the loop check and the insert in this case, in + * order to prevent two separate inserts from racing and each doing the + * insert "at the same time" such that ep_loop_check passes on both + * before either one does the insert, thereby creating a cycle. + */ + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } + + mutex_lock(&ep->mtx); /* @@ -1312,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: + if (unlikely(did_lock_epmutex)) + mutex_unlock(&epmutex); + fput(tfile); error_fput: fput(file); @@ -1431,6 +1530,12 @@ static int __init eventpoll_init(void) EP_ITEM_COST; BUG_ON(max_user_watches < 0); + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/fs/exec.c b/fs/exec.c index c62efcb959c..ba99e1abb1a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct file *file; char *tmp = getname(library); int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -721,10 +724,13 @@ struct file *open_exec(const char *name) { struct file *file; int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; - file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0, - MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); if (IS_ERR(file)) goto out; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 42685424817..a7555238c41 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1030,7 +1030,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1131,7 +1130,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; - inode->i_mapping->backing_dev_info = sb->s_bdi; sb->s_dirt = 1; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 264e95d0283..4d70db110cf 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = exofs_find_entry(new_dir, new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); err = exofs_set_link(new_dir, new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; if (dir_de) @@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= EXOFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = exofs_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = CURRENT_TIME; exofs_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4b6825740dd..b05acb79613 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - - if (len < 2 || (connectable && len < 4)) + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; return 255; + } len = 2; fid->i32.ino = inode->i_ino; @@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, /* * Try to get any dentry for the given file handle from the filesystem. */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (!result) result = ERR_PTR(-ESTALE); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 6346a2acf32..1b48c337087 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); /* ialloc.c */ -extern struct inode * ext2_new_inode (struct inode *, int); +extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); extern void ext2_check_inodes_bitmap (struct super_block *); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index ad70479aabf..ee9ed31948e 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -429,7 +429,8 @@ found: return group; } -struct inode *ext2_new_inode(struct inode *dir, int mode) +struct inode *ext2_new_inode(struct inode *dir, int mode, + const struct qstr *qstr) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -585,7 +586,7 @@ got: if (err) goto fail_free_drop; - err = ext2_init_security(inode,dir); + err = ext2_init_security(inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 2e1d8341d82..ed5c5d496ee 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st dquot_initialize(dir); - inode = ext2_new_inode(dir, mode); + inode = ext2_new_inode(dir, mode, &dentry->d_name); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ dquot_initialize(dir); - inode = ext2_new_inode (dir, mode); + inode = ext2_new_inode (dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, dquot_initialize(dir); - inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; @@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) inode_inc_link_count(dir); - inode = ext2_new_inode (dir, S_IFDIR | mode); + inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_dir; @@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); ext2_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (new_dir->i_nlink >= EXT2_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = ext2_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); ext2_delete_entry (old_de, old_page); - inode_dec_link_count(old_inode); if (dir_de) { if (old_dir != new_dir) diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index a1a1c218461..5e41cccff76 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -116,9 +116,11 @@ exit_ext2_xattr(void) # endif /* CONFIG_EXT2_FS_XATTR */ #ifdef CONFIG_EXT2_FS_SECURITY -extern int ext2_init_security(struct inode *inode, struct inode *dir); +extern int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); #else -static inline int ext2_init_security(struct inode *inode, struct inode *dir) +static inline int ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { return 0; } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 3004e15d5da..5d979b4347b 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, } int -ext2_init_security(struct inode *inode, struct inode *dir) +ext2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9724aef2246..bfc2dc43681 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, + const struct qstr *qstr, int mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -589,7 +590,7 @@ got: if (err) goto fail_free_drop; - err = ext3_init_security(handle,inode, dir); + err = ext3_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b27ba71810e..0521a007ae6 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1710,7 +1710,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; @@ -1746,7 +1746,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1784,7 +1784,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2206,7 +2206,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 85c8cc8f247..9cc19a1dea8 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 377fe720116..2be4f69bfa6 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -128,10 +128,10 @@ exit_ext3_xattr(void) #ifdef CONFIG_EXT3_FS_SECURITY extern int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext3_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 03a99bfc59f..b8d9f83aa5c 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, } int -ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c8d97b56f3..3aa0b72b3b9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -848,6 +848,7 @@ struct ext4_inode_info { atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 63a75810b7c..ccce8a7e94e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * that this IO needs to convertion to written when IO is * completed */ - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) map->m_flags |= EXT4_MAP_UNINIT; @@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * that we need to perform convertion when IO is done. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) + if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { io->flag = EXT4_IO_END_UNWRITTEN; - else + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2e8322c8aa8..7b80d543b89 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } +static void ext4_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + size_t count = iov_length(iov, nr_segs); + loff_t final_size = pos + count; + + if (pos >= inode->i_size) + return 0; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + + return 0; +} + static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int unaligned_aio = 0; + int ret; /* * If we have encountered a bitmap-format file, the size limit @@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, nr_segs = iov_shorten((struct iovec *)iov, nr_segs, sbi->s_bitmap_maxbytes - pos); } + } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && + !is_sync_kiocb(iocb))) { + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); } - return generic_file_aio_write(iocb, iov, nr_segs, pos); + /* Unaligned direct AIO must be serialized; see comment above */ + if (unaligned_aio) { + static unsigned long unaligned_warn_time; + + /* Warn about this once per day */ + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) + ext4_msg(inode->i_sb, KERN_WARNING, + "Unaligned AIO/DIO on inode %ld by %s; " + "performance will be poor.", + inode->i_ino, current->comm); + mutex_lock(ext4_aio_mutex(inode)); + ext4_aiodio_wait(inode); + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (unaligned_aio) + mutex_unlock(ext4_aio_mutex(inode)); + + return ret; } static const struct vm_operations_struct ext4_file_vm_ops = { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index eb9097aec6f..78b79e1bd7e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1042,7 +1042,7 @@ got: if (err) goto fail_free_drop; - err = ext4_init_security(handle, inode, dir); + err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 851f49b2f9d..d1fe09aea73 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ -#define NR_GRPINFO_CACHES \ - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1) +#define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, @@ -2414,6 +2419,55 @@ err_freesgi: return -ENOMEM; } +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + ext4_groupinfo_caches[cache_index] = cachep; + + return 0; +} + int ext4_mb_init(struct super_block *sb, int needs_recovery) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned offset; unsigned max; int ret; - int cache_index; - struct kmem_cache *cachep; - char *namep = NULL; i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); @@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) goto out; } - cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; - cachep = ext4_groupinfo_caches[cache_index]; - if (!cachep) { - char name[32]; - int len = offsetof(struct ext4_group_info, - bb_counters[sb->s_blocksize_bits + 2]); - - sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits); - namep = kstrdup(name, GFP_KERNEL); - if (!namep) { - ret = -ENOMEM; - goto out; - } - - /* Need to free the kmem_cache_name() when we - * destroy the slab */ - cachep = kmem_cache_create(namep, len, 0, - SLAB_RECLAIM_ACCOUNT, NULL); - if (!cachep) { - ret = -ENOMEM; - goto out; - } - ext4_groupinfo_caches[cache_index] = cachep; - } + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; @@ -2520,7 +2550,6 @@ out: if (ret) { kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - kfree(namep); } return ret; } @@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void) void ext4_exit_mballoc(void) { - int i; /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. @@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); - - for (i = 0; i < NR_GRPINFO_CACHES; i++) { - struct kmem_cache *cachep = ext4_groupinfo_caches[i]; - if (cachep) { - char *name = (char *)kmem_cache_name(cachep); - kmem_cache_destroy(cachep); - kfree(name); - } - } + ext4_groupinfo_destroy_slabs(); ext4_remove_debugfs_entry(); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5485390d32c..e781b7ea563 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7270dcfca92..955cc309142 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,14 +32,8 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; -#define WQ_HASH_SZ 37 -#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) -static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; - int __init ext4_init_pageio(void) { - int i; - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -48,9 +42,6 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } - for (i = 0; i < WQ_HASH_SZ; i++) - init_waitqueue_head(&ioend_wq[i]); - return 0; } @@ -62,7 +53,7 @@ void ext4_exit_pageio(void) void ext4_ioend_wait(struct inode *inode) { - wait_queue_head_t *wq = to_ioend_wq(inode); + wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); } @@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io) for (i = 0; i < io->num_io_pages; i++) put_io_page(io->pages[i]); io->num_io_pages = 0; - wq = to_ioend_wq(io->inode); + wq = ext4_ioend_wq(io->inode); if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && waitqueue_active(wq)) wake_up_all(wq); @@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io) struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + wait_queue_head_t *wq; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," @@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io) if (io->iocb) aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ - io->flag &= ~EXT4_IO_END_UNWRITTEN; + if (io->flag & EXT4_IO_END_UNWRITTEN) { + io->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + wq = ext4_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + return ret; } @@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error) struct inode *inode; unsigned long flags; int i; + sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); bio->bi_private = NULL; @@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) SetPageError(page); BUG_ON(!head); - if (head->b_size == PAGE_CACHE_SIZE) - clear_buffer_dirty(head); - else { + if (head->b_size != PAGE_CACHE_SIZE) { loff_t offset; loff_t io_end_offset = io_end->offset + io_end->size; @@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) buffer_io_error(bh); - clear_buffer_dirty(bh); } if (buffer_delay(bh)) partial_write = 1; @@ -257,7 +256,7 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) - bio->bi_sector >> (inode->i_blkbits - 9)); + bi_sector >> (inode->i_blkbits - 9)); } /* Add the io_end to per-inode completed io list*/ @@ -380,6 +379,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, blocksize = 1 << inode->i_blkbits; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); set_page_writeback(page); ClearPageError(page); @@ -397,12 +397,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; if (block_start >= len) { clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; } + clear_buffer_dirty(bh); ret = io_submit_add_bh(io, io_page, inode, wbc, bh); if (ret) { /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 48ce561fafa..203f9e4a70b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext3_fs_type = { @@ -832,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_aiodio_unwritten, 0); return &ei->vfs_inode; } @@ -2716,6 +2718,8 @@ static void ext4_unregister_li_request(struct super_block *sb) mutex_unlock(&ext4_li_info->li_list_mtx); } +static struct task_struct *ext4_lazyinit_task; + /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. @@ -2784,6 +2788,10 @@ cont_thread: if (time_before(jiffies, next_wakeup)) schedule(); finish_wait(&eli->li_wait_daemon, &wait); + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } } exit_thread: @@ -2808,6 +2816,7 @@ exit_thread: wake_up(&eli->li_wait_task); kfree(ext4_li_info); + ext4_lazyinit_task = NULL; ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); @@ -2830,11 +2839,10 @@ static void ext4_clear_request_list(void) static int ext4_run_lazyinit_thread(void) { - struct task_struct *t; - - t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); - if (IS_ERR(t)) { - int err = PTR_ERR(t); + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); del_timer_sync(&ext4_li_info->li_timer); kfree(ext4_li_info); @@ -2985,16 +2993,10 @@ static void ext4_destroy_lazyinit_thread(void) * If thread exited earlier * there's nothing to be done. */ - if (!ext4_li_info) + if (!ext4_li_info || !ext4_lazyinit_task) return; - ext4_clear_request_list(); - - while (ext4_li_info->li_task) { - wake_up(&ext4_li_info->li_wait_daemon); - wait_event(ext4_li_info->li_wait_task, - ext4_li_info->li_task == NULL); - } + kthread_stop(ext4_lazyinit_task); } static int ext4_fill_super(struct super_block *sb, void *data, int silent) @@ -3413,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); @@ -3507,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); no_journal: - EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->dio_unwritten_wq = + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); goto failed_mount_wq; @@ -4768,7 +4777,7 @@ static struct file_system_type ext4_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -int __init ext4_init_feat_adverts(void) +static int __init ext4_init_feat_adverts(void) { struct ext4_features *ef; int ret = -ENOMEM; @@ -4792,23 +4801,44 @@ out: return ret; } +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + static int __init ext4_init_fs(void) { - int err; + int i, err; ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + err = ext4_init_pageio(); if (err) return err; err = ext4_init_system_zone(); if (err) - goto out5; + goto out7; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - goto out4; + goto out6; ext4_proc_root = proc_mkdir("fs/ext4", NULL); + if (!ext4_proc_root) + goto out5; err = ext4_init_feat_adverts(); + if (err) + goto out4; err = ext4_init_mballoc(); if (err) @@ -4838,12 +4868,14 @@ out1: out2: ext4_exit_mballoc(); out3: - kfree(ext4_feat); + ext4_exit_feat_adverts(); +out4: remove_proc_entry("fs/ext4", NULL); +out5: kset_unregister(ext4_kset); -out4: +out6: ext4_exit_system_zone(); -out5: +out7: ext4_exit_pageio(); return err; } @@ -4857,6 +4889,7 @@ static void __exit ext4_exit_fs(void) destroy_inodecache(); ext4_exit_xattr(); ext4_exit_mballoc(); + ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); ext4_exit_system_zone(); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 1ef16520b95..25b7387ff18 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 9b21268e121..007c3bfbf09 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, } int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(inode, dir, &name, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 86753fe10bd..0e277ec4b61 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; - if (len < 5) + if (len < 5) { + *lenp = 5; return 255; /* no room */ + } ipos_h = MSDOS_I(inode)->i_pos >> 8; ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f88f752babd..adae3fb7451 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* This is not negative dentry. Always valid. */ @@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* diff --git a/fs/fcntl.c b/fs/fcntl.c index ecc8b3954ed..6c82e5bac03 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; - struct file *file = fget(fildes); + struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd(); @@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; long err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, long err; err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -808,14 +835,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - FMODE_EXEC + __FMODE_EXEC | O_PATH )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 00000000000..bf93ad2bee0 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,265 @@ +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/exportfs.h> +#include <linux/fs_struct.h> +#include <linux/fsnotify.h> +#include <asm/uaccess.h> +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/file_table.c b/fs/file_table.c index c3e89adf53c..01e4c1e8e6b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -125,13 +125,13 @@ struct file *get_empty_filp(void) goto fail; percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); if (security_file_alloc(f)) goto fail_sec; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_cred = get_cred(cred); spin_lock_init(&f->f_lock); eventpoll_init_file(f); /* f->f_version: 0 */ @@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode, file_take_write(file); WARN_ON(mnt_clone_write(path->mnt)); } - ima_counts_get(file); + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); @@ -246,11 +247,15 @@ static void __fput(struct file *file) file->f_op->release(inode, file); security_file_free(file); ima_file_free(file); - if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && + !(file->f_mode & FMODE_PATH))) { cdev_put(inode->i_cdev); + } fops_put(file->f_op); put_pid(file->f_owner.pid); file_sb_list_del(file); + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_dec(inode); if (file->f_mode & FMODE_WRITE) drop_file_write_access(file); file->f_path.dentry = NULL; @@ -276,11 +281,10 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; - } + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); @@ -289,6 +293,25 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * @@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); } else { rcu_read_lock(); file = fcheck_files(files, fd); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bfed8447ed8..8bd0ef9286c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { struct inode *inode; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = entry->d_inode; @@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) return err; - if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) - return 0; + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } if (attr->ia_valid & ATTR_SIZE) is_truncate = true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 95da1bc1c82..9e0832dbb1e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } kfree(ff); } } @@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode) * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. */ - fuse_file_put(ff); + fuse_file_put(ff, ff->fc->destroy_req != NULL); } static int fuse_open(struct inode *inode, struct file *file) @@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) page_cache_release(page); } if (req->ff) - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ae5744a2f9e..d4286947bc2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/poll.h> +#include <linux/workqueue.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -262,7 +263,10 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - struct fuse_release_in in; + union { + struct fuse_release_in in; + struct work_struct work; + }; struct path path; } release; struct fuse_init_in init_in; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9e3f68cc1bd..051b1a08452 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, u64 nodeid; u32 generation; - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 7118f1a780a..cbc07155b1a 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) struct posix_acl *acl; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; + if (flags & IPERM_FLAG_RCU) { + if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) + return -ECHILD; + return -EAGAIN; + } acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); if (IS_ERR(acl)) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4f36f8832b9..aad77e4f61b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -695,6 +695,7 @@ out: if (error == 0) return 0; + unlock_page(page); page_cache_release(page); gfs2_trans_end(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3c4039d5eef..ef3dc4b9fae 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -21,6 +21,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "dir.h" #include "util.h" @@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrp_list rlist; u64 bn, bstart; - u32 blen; + u32 blen, btotal; __be64 *p; unsigned int rg_blocks = 0; int metadata; @@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, bstart = 0; blen = 0; + btotal = 0; for (p = top; p < bottom; p++) { if (!*p) @@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else { if (bstart) { if (metadata) - gfs2_free_meta(ip, bstart, blen); + __gfs2_free_meta(ip, bstart, blen); else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; } bstart = bn; @@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, } if (bstart) { if (metadata) - gfs2_free_meta(ip, bstart, blen); + __gfs2_free_meta(ip, bstart, blen); else - gfs2_free_data(ip, bstart, blen); + __gfs2_free_data(ip, bstart, blen); + + btotal += blen; } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4a456338b87..0da8da2c991 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) int error; int had_lock = 0; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9023db8184f..b5a5e60df0d 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7cfdcb91336..4074b952b05 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - if (!(file->f_flags & O_NOATIME)) { + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); error = gfs2_glock_nq(&i_gh); - file_accessed(file); - if (error == 0) - gfs2_glock_dq_uninit(&i_gh); + if (error == 0) { + file_accessed(file); + gfs2_glock_dq(&i_gh); + } + gfs2_holder_uninit(&i_gh); + if (error) + return error; } vma->vm_ops = &gfs2_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; @@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from, { struct gfs2_inode *ip = GFS2_I(page->mapping->host); - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); + zero_user(page, from, to-from); mark_page_accessed(page); if (!gfs2_is_writeback(ip)) @@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from, block_commit_write(page, from, to); } -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +static int needs_empty_write(sector_t block, struct inode *inode) { - unsigned start, end, next; - struct buffer_head *bh, *head; int error; + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; + bh_map.b_size = 1 << inode->i_blkbits; + error = gfs2_block_map(inode, block, &bh_map, 0); + if (unlikely(error)) + return error; + return !buffer_mapped(&bh_map); +} - empty_write_end(page, from, to); - return 0; - } +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + unsigned start, end, next, blksize; + sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + int ret; - bh = head = page_buffers(page); + blksize = 1 << inode->i_blkbits; next = end = 0; while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; + next += blksize; + block++; } start = next; do { - next += bh->b_size; - if (buffer_mapped(bh)) { + next += blksize; + ret = needs_empty_write(block, inode); + if (unlikely(ret < 0)) + return ret; + if (ret == 0) { if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; + ret = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(ret)) + return ret; empty_write_end(page, start, end); end = 0; } @@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) } else end = next; - bh = bh->b_this_page; + block++; } while (next < to); if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; + ret = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(ret)) + return ret; empty_write_end(page, start, end); } @@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); flock_lock_file_wait(file, fl); - if (fl_gh->gh_gl) - gfs2_glock_dq_uninit(fl_gh); + if (fl_gh->gh_gl) { + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_uninit(fl_gh); + } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 08a8beb152e..e2431313491 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -26,6 +26,9 @@ #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> #include "gfs2.h" #include "incore.h" @@ -41,10 +44,6 @@ #define CREATE_TRACE_POINTS #include "trace_gfs2.h" -struct gfs2_gl_hash_bucket { - struct hlist_head hb_list; -}; - struct gfs2_glock_iter { int hash; /* hash bucket index */ struct gfs2_sbd *sdp; /* incore superblock */ @@ -54,7 +53,6 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); @@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) #define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) -static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; +static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE]; static struct dentry *gfs2_root; -/* - * Despite what you might think, the numbers below are not arbitrary :-) - * They are taken from the ipv4 routing hash code, which is well tested - * and thus should be nearly optimal. Later on we might tweek the numbers - * but for now this should be fine. - * - * The reason for putting the locks in a separate array from the list heads - * is that we can have fewer locks than list heads and save memory. We use - * the same hash function for both, but with a different hash mask. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ - defined(CONFIG_PROVE_LOCKING) - -#ifdef CONFIG_LOCKDEP -# define GL_HASH_LOCK_SZ 256 -#else -# if NR_CPUS >= 32 -# define GL_HASH_LOCK_SZ 4096 -# elif NR_CPUS >= 16 -# define GL_HASH_LOCK_SZ 2048 -# elif NR_CPUS >= 8 -# define GL_HASH_LOCK_SZ 1024 -# elif NR_CPUS >= 4 -# define GL_HASH_LOCK_SZ 512 -# else -# define GL_HASH_LOCK_SZ 256 -# endif -#endif - -/* We never want more locks than chains */ -#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ -# undef GL_HASH_LOCK_SZ -# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE -#endif - -static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ]; - -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)]; -} -#else /* not SMP, so no spinlocks required */ -static inline rwlock_t *gl_lock_addr(unsigned int x) -{ - return NULL; -} -#endif - /** * gl_hash() - Turn glock number into hash bucket number * @lock: The glock number @@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp, return h; } -/** - * glock_free() - Perform a few checks and then release struct gfs2_glock - * @gl: The glock to release - * - * Also calls lock module to release its internal structure for this glock. - * - */ +static inline void spin_lock_bucket(unsigned int hash) +{ + struct hlist_bl_head *bl = &gl_hash_table[hash]; + bit_spin_lock(0, (unsigned long *)bl); +} -static void glock_free(struct gfs2_glock *gl) +static inline void spin_unlock_bucket(unsigned int hash) +{ + struct hlist_bl_head *bl = &gl_hash_table[hash]; + __bit_spin_unlock(0, (unsigned long *)bl); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + if (gl->gl_ops->go_flags & GLOF_ASPACE) + kmem_cache_free(gfs2_glock_aspace_cachep, gl); + else + kmem_cache_free(gfs2_glock_cachep, gl); +} + +void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - struct address_space *mapping = gfs2_glock2aspace(gl); - struct kmem_cache *cachep = gfs2_glock_cachep; - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); - trace_gfs2_glock_put(gl); - if (mapping) - cachep = gfs2_glock_aspace_cachep; - sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl); + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); } /** @@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; + /* assert_spin_locked(&gl->gl_spin); */ + if (gl->gl_state == LM_ST_UNLOCKED) return 0; - if (!list_empty(&gl->gl_holders)) + if (test_bit(GLF_LFLUSH, &gl->gl_flags)) + return 0; + if ((gl->gl_name.ln_type != LM_TYPE_INODE) && + !list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) return glops->go_demote_ok(gl); return 1; } + /** - * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock * + * If the glock is demotable, then we add it (or move it) to the end + * of the glock LRU list. */ -static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { - int may_reclaim; - may_reclaim = (demote_ok(gl) && - (atomic_read(&gl->gl_ref) == 1 || - (gl->gl_name.ln_type == LM_TYPE_INODE && - atomic_read(&gl->gl_ref) <= 2))); - spin_lock(&lru_lock); - if (list_empty(&gl->gl_lru) && may_reclaim) { + if (demote_ok(gl)) { + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + list_add_tail(&gl->gl_lru, &lru_list); - atomic_inc(&lru_count); + spin_unlock(&lru_lock); } - spin_unlock(&lru_lock); +} + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + __gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); } /** @@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl) { if (atomic_dec_and_test(&gl->gl_ref)) GLOCK_BUG_ON(gl, 1); - gfs2_glock_schedule_for_reclaim(gl); } /** @@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl) * */ -int gfs2_glock_put(struct gfs2_glock *gl) +void gfs2_glock_put(struct gfs2_glock *gl) { - int rv = 0; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); - write_lock(gl_lock_addr(gl->gl_hash)); - if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { - hlist_del(&gl->gl_list); + if (atomic_dec_and_test(&gl->gl_ref)) { + spin_lock_bucket(gl->gl_hash); + hlist_bl_del_rcu(&gl->gl_list); + spin_unlock_bucket(gl->gl_hash); + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); } spin_unlock(&lru_lock); - write_unlock(gl_lock_addr(gl->gl_hash)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - glock_free(gl); - rv = 1; - goto out; + GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } - spin_lock(&gl->gl_spin); - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); - write_unlock(gl_lock_addr(gl->gl_hash)); -out: - return rv; } /** @@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash, const struct lm_lockname *name) { struct gfs2_glock *gl; - struct hlist_node *h; + struct hlist_bl_node *h; - hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { + hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) { if (!lm_name_equal(&gl->gl_name, name)) continue; if (gl->gl_sbd != sdp) continue; - - atomic_inc(&gl->gl_ref); - - return gl; + if (atomic_inc_not_zero(&gl->gl_ref)) + return gl; } return NULL; @@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct gfs2_glock *gl, *tmp; unsigned int hash = gl_hash(sdp, &name); struct address_space *mapping; + struct kmem_cache *cachep; - read_lock(gl_lock_addr(hash)); + rcu_read_lock(); gl = search_bucket(hash, sdp, &name); - read_unlock(gl_lock_addr(hash)); + rcu_read_unlock(); *glp = gl; if (gl) @@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return -ENOENT; if (glops->go_flags & GLOF_ASPACE) - gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); + cachep = gfs2_glock_aspace_cachep; else - gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); + cachep = gfs2_glock_cachep; + gl = kmem_cache_alloc(cachep, GFP_KERNEL); if (!gl) return -ENOMEM; @@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } - write_lock(gl_lock_addr(hash)); + spin_lock_bucket(hash); tmp = search_bucket(hash, sdp, &name); if (tmp) { - write_unlock(gl_lock_addr(hash)); - glock_free(gl); + spin_unlock_bucket(hash); + kmem_cache_free(cachep, gl); + atomic_dec(&sdp->sd_glock_disposal); gl = tmp; } else { - hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); - write_unlock(gl_lock_addr(hash)); + hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]); + spin_unlock_bucket(hash); } *glp = gl; @@ -1007,13 +978,13 @@ fail: insert_pt = &gh2->gh_list; } set_bit(GLF_QUEUED, &gl->gl_flags); + trace_gfs2_glock_queue(gh, 1); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) goto do_cancel; return; } - trace_gfs2_glock_queue(gh, 1); list_add_tail(&gh->gh_list, insert_pt); do_cancel: gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); @@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } + __gfs2_glock_schedule_for_reclaim(gl); trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); if (likely(fast_path)) @@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) { - unsigned int x; - - for (x = 0; x < num_gh; x++) - gfs2_glock_dq(&ghs[x]); + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); } /** @@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) { - unsigned int x; - - for (x = 0; x < num_gh; x++) - gfs2_glock_dq_uninit(&ghs[x]); + while (num_gh--) + gfs2_glock_dq_uninit(&ghs[num_gh]); } void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) @@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = { * @sdp: the filesystem * @bucket: the bucket * - * Returns: 1 if the bucket has entries */ -static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, +static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp, unsigned int hash) { - struct gfs2_glock *gl, *prev = NULL; - int has_entries = 0; - struct hlist_head *head = &gl_hash_table[hash].hb_list; + struct gfs2_glock *gl; + struct hlist_bl_head *head = &gl_hash_table[hash]; + struct hlist_bl_node *pos; - read_lock(gl_lock_addr(hash)); - /* Can't use hlist_for_each_entry - don't want prefetch here */ - if (hlist_empty(head)) - goto out; - gl = list_entry(head->first, struct gfs2_glock, gl_list); - while(1) { - if (!sdp || gl->gl_sbd == sdp) { - gfs2_glock_hold(gl); - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); - prev = gl; + rcu_read_lock(); + hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) { + if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref)) examiner(gl); - has_entries = 1; - read_lock(gl_lock_addr(hash)); - } - if (gl->gl_list.next == NULL) - break; - gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list); } -out: - read_unlock(gl_lock_addr(hash)); - if (prev) - gfs2_glock_put(prev); + rcu_read_unlock(); cond_resched(); - return has_entries; +} + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + unsigned x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(examiner, sdp, x); } @@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl) void gfs2_glock_thaw(struct gfs2_sbd *sdp) { - unsigned x; + glock_hash_walk(thaw_glock, sdp); +} - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(thaw_glock, sdp, x); +static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = __dump_glock(seq, gl); + spin_unlock(&gl->gl_spin); + return ret; +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl); } /** @@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { - unsigned int x; - - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(clear_glock, sdp, x); + glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); - gfs2_dump_lockstate(sdp); + glock_hash_walk(dump_glock_func, sdp); } void gfs2_glock_finish_truncate(struct gfs2_inode *ip) @@ -1717,73 +1681,22 @@ out: return error; } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) -{ - int ret; - spin_lock(&gl->gl_spin); - ret = __dump_glock(seq, gl); - spin_unlock(&gl->gl_spin); - return ret; -} -/** - * gfs2_dump_lockstate - print out the current lockstate - * @sdp: the filesystem - * @ub: the buffer to copy the information into - * - * If @ub is NULL, dump the lockstate to the console. - * - */ - -static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) -{ - struct gfs2_glock *gl; - struct hlist_node *h; - unsigned int x; - int error = 0; - - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { - - read_lock(gl_lock_addr(x)); - - hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) { - if (gl->gl_sbd != sdp) - continue; - - error = dump_glock(NULL, gl); - if (error) - break; - } - - read_unlock(gl_lock_addr(x)); - - if (error) - break; - } - - - return error; -} int __init gfs2_glock_init(void) { unsigned i; for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { - INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); - } -#ifdef GL_HASH_LOCK_SZ - for(i = 0; i < GL_HASH_LOCK_SZ; i++) { - rwlock_init(&gl_hash_locks[i]); + INIT_HLIST_BL_HEAD(&gl_hash_table[i]); } -#endif glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_FREEZEABLE, 0); + WQ_HIGHPRI | WQ_FREEZABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", - WQ_MEM_RECLAIM | WQ_FREEZEABLE, + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); @@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void) destroy_workqueue(gfs2_delete_workqueue); } +static inline struct gfs2_glock *glock_hash_chain(unsigned hash) +{ + return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]), + struct gfs2_glock, gl_list); +} + +static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl) +{ + return hlist_bl_entry(rcu_dereference(gl->gl_list.next), + struct gfs2_glock, gl_list); +} + static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; -restart: - read_lock(gl_lock_addr(gi->hash)); - gl = gi->gl; - if (gl) { - gi->gl = hlist_entry(gl->gl_list.next, - struct gfs2_glock, gl_list); - } else { - gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, - struct gfs2_glock, gl_list); - } - if (gi->gl) - gfs2_glock_hold(gi->gl); - read_unlock(gl_lock_addr(gi->hash)); - if (gl) - gfs2_glock_put(gl); - while (gi->gl == NULL) { - gi->hash++; - if (gi->hash >= GFS2_GL_HASH_SIZE) - return 1; - read_lock(gl_lock_addr(gi->hash)); - gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, - struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); - read_unlock(gl_lock_addr(gi->hash)); - } - - if (gi->sdp != gi->gl->gl_sbd) - goto restart; + do { + gl = gi->gl; + if (gl) { + gi->gl = glock_hash_next(gl); + } else { + gi->gl = glock_hash_chain(gi->hash); + } + while (gi->gl == NULL) { + gi->hash++; + if (gi->hash >= GFS2_GL_HASH_SIZE) { + rcu_read_unlock(); + return 1; + } + gi->gl = glock_hash_chain(gi->hash); + } + /* Skip entries for other sb and dead entries */ + } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0); return 0; } -static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi) -{ - if (gi->gl) - gfs2_glock_put(gi->gl); - gi->gl = NULL; -} - static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; gi->hash = 0; + rcu_read_lock(); do { - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + if (gfs2_glock_iter_next(gi)) return NULL; - } } while (n--); return gi->gl; @@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; - if (gfs2_glock_iter_next(gi)) { - gfs2_glock_iter_free(gi); + if (gfs2_glock_iter_next(gi)) return NULL; - } return gi->gl; } @@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - gfs2_glock_iter_free(gi); + + if (gi->gl) + rcu_read_unlock(); + gi->gl = NULL; } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 691851ceb61..aea160690e9 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -118,7 +118,7 @@ struct lm_lockops { int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); - void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); + void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); @@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, int create, struct gfs2_glock **glp); void gfs2_glock_hold(struct gfs2_glock *gl); void gfs2_glock_put_nolock(struct gfs2_glock *gl); -int gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); void gfs2_holder_reinit(unsigned int state, unsigned flags, @@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, return error; } -/* Lock Value Block functions */ - -int gfs2_lvb_hold(struct gfs2_glock *gl); -void gfs2_lvb_unhold(struct gfs2_glock *gl); - -void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); -void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -void gfs2_glock_finish_truncate(struct gfs2_inode *ip); -void gfs2_glock_thaw(struct gfs2_sbd *sdp); - -int __init gfs2_glock_init(void); -void gfs2_glock_exit(void); - -int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); -void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -int gfs2_register_debugfs(void); -void gfs2_unregister_debugfs(void); +extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern int gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); extern const struct lm_lockops gfs2_dlm_ops; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 263561bf1a5..3754e3cbf02 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) BUG_ON(current->journal_info); current->journal_info = &tr; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_ail_gl_list); bh = bd->bd_bh; gfs2_remove_from_ail(bd); + spin_unlock(&sdp->sd_ail_lock); + bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_log_lock(sdp); gfs2_assert_withdraw(sdp, !buffer_busy(bh)); gfs2_trans_add_revoke(sdp, bd); + gfs2_log_unlock(sdp); + + spin_lock(&sdp->sd_ail_lock); } gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_holder *gh; + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) return 0; + + if (!list_empty(&gl->gl_holders)) { + gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); + if (gh->gh_list.next != &gl->gl_holders) + return 0; + } + return 1; } @@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) } /** - * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int rgrp_go_demote_ok(const struct gfs2_glock *gl) -{ - const struct address_space *mapping = (const struct address_space *)(gl + 1); - return !mapping->nrpages; -} - -/** * rgrp_go_lock - operation done after an rgrp lock is locked by * a first holder on this node. * @gl: the glock @@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_xmote_th = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a79790c0627..870a89d6d4d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -15,6 +15,8 @@ #include <linux/workqueue.h> #include <linux/dlm.h> #include <linux/buffer_head.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -201,7 +203,7 @@ enum { }; struct gfs2_glock { - struct hlist_node gl_list; + struct hlist_bl_node gl_list; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; atomic_t gl_ref; @@ -234,6 +236,7 @@ struct gfs2_glock { atomic_t gl_ail_count; struct delayed_work gl_work; struct work_struct gl_delete; + struct rcu_head gl_rcu; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ @@ -314,6 +317,7 @@ enum { QDF_USER = 0, QDF_CHANGE = 1, QDF_LOCKED = 2, + QDF_REFRESH = 3, }; struct gfs2_quota_data { @@ -647,6 +651,7 @@ struct gfs2_sbd { unsigned int sd_log_flush_head; u64 sd_log_flush_wrapped; + spinlock_t sd_ail_lock; struct list_head sd_ail1_list; struct list_head sd_ail2_list; u64 sd_ail_sync_gen; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7aa7d4f8984..97d54a28776 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -763,14 +763,15 @@ fail: return error; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, + const struct qstr *qstr) { int err; size_t len; void *value; char *name; - err = security_inode_init_security(&ip->i_inode, &dip->i_inode, + err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, &name, &value, &len); if (err) { @@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - error = gfs2_security_init(dip, GFS2_I(inode)); + error = gfs2_security_init(dip, GFS2_I(inode), name); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6e493aee28f..98c80d8c2a6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -22,7 +22,6 @@ static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl->gl_state; - struct gfs2_sbd *sdp = gl->gl_sbd; BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); @@ -31,12 +30,7 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ - if (gl->gl_ops->go_flags & GLOF_ASPACE) - kmem_cache_free(gfs2_glock_aspace_cachep, gl); - else - kmem_cache_free(gfs2_glock_cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + gfs2_glock_free(gl); return; case -DLM_ECANCEL: /* Cancel while getting lock */ ret |= LM_OUT_CANCELED; @@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } -static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) +static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; if (gl->gl_lksb.sb_lkid == 0) { - kmem_cache_free(cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb01f3575e1..e7ed31f858d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, * @mapping: The associated mapping (maybe NULL) * @bd: The gfs2_bufdata to remove * - * The log lock _must_ be held when calling this function + * The ail lock _must_ be held when calling this function * */ @@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) */ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) -__releases(&sdp->sd_log_lock) -__acquires(&sdp->sd_log_lock) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -117,7 +117,7 @@ __acquires(&sdp->sd_log_lock) list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); get_bh(bh); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; @@ -126,7 +126,7 @@ __acquires(&sdp->sd_log_lock) unlock_buffer(bh); brelse(bh); } - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); retry = 1; break; @@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) struct gfs2_ail *ai; int done = 0; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); head = &sdp->sd_ail1_list; if (list_empty(head)) { - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return; } sync_gen = sdp->sd_ail_sync_gen++; @@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) if (ai->ai_sync_gen >= sync_gen) continue; ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ + gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */ done = 0; break; } } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); } static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) @@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) struct gfs2_ail *ai, *s; int ret; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { if (gfs2_ail1_empty_one(sdp, ai, flags)) @@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) ret = list_empty(&sdp->sd_ail1_list); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return ret; } @@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) int wrap = (new_tail < old_tail); int a, b, rm; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { a = (old_tail <= ai->ai_first); @@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) kfree(ai); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); } /** @@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) struct gfs2_ail *ai; unsigned int tail; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; @@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) tail = ai->ai_first; } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ail_lock); return tail; } @@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; + spin_lock(&sdp->sd_ail_lock); if (!list_empty(&ai->ai_ail1_list)) { list_add(&ai->ai_list, &sdp->sd_ail1_list); ai = NULL; } + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index bf33f822058..e919abf25ec 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) /* If this buffer is in the AIL and it has already been written * to in-place disk block, remove it from the AIL. */ + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + spin_unlock(&sdp->sd_ail_lock); get_bh(bh); atomic_inc(&sdp->sd_log_pinned); trace_gfs2_pin(bd, 1); @@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, mark_buffer_dirty(bh); clear_buffer_pinned(bh); - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { list_del(&bd->bd_ail_st_list); brelse(bh); @@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, } bd->bd_ail = ai; list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); - clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + spin_unlock(&sdp->sd_ail_lock); + + if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags)) + gfs2_glock_schedule_for_reclaim(bd->bd_gl); trace_gfs2_pin(bd, 0); - gfs2_log_unlock(sdp); unlock_buffer(bh); atomic_dec(&sdp->sd_log_pinned); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index ebef7ab6e17..888a5f5a1a5 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -14,6 +14,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/gfs2_ondisk.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> #include <asm/atomic.h> #include "gfs2.h" @@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - INIT_HLIST_NODE(&gl->gl_list); + INIT_HLIST_BL_NODE(&gl->gl_list); spin_lock_init(&gl->gl_spin); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); @@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo) struct address_space *mapping = (struct address_space *)(gl + 1); gfs2_init_glock_once(gl); - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + address_space_init_once(mapping); } /** @@ -144,7 +139,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", - WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); if (!gfs_recovery_wq) goto fail_wq; @@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + rcu_barrier(); + kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 939739c7b3f..01d97f48655 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -326,6 +326,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int brelse(bh); } if (bd) { + spin_lock(&sdp->sd_ail_lock); if (bd->bd_ail) { gfs2_remove_from_ail(bd); bh->b_private = NULL; @@ -333,6 +334,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int bd->bd_blkno = bh->b_blocknr; gfs2_trans_add_revoke(sdp, bd); } + spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 777927ce6f7..42ef24355af 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_log_waitq); init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); @@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = { { Opt_err, NULL }, }; -static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - kmem_cache_free(cachep, gl); - if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); -} - static const struct lm_lockops nolock_ops = { .lm_proto_name = "lock_nolock", - .lm_put_lock = nolock_put_lock, + .lm_put_lock = gfs2_glock_free, .lm_tokens = &nolock_tokens, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d8b26ac2e20..09e436a5072 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) /** * gfs2_permission - - * @inode: - * @mask: - * @nd: passed from Linux VFS, ignored by us + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not * * This may be called from the VFS directly, or from within GFS2 with the * inode locked, so we look to see if the glock is already locked and only @@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags) int error; int unlock = 0; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a689901963d..e23d9864c41 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out_end_trans; do_qc(qd, -qd->qd_change_sync); + set_bit(QDF_REFRESH, &qd->qd_flags); } error = 0; @@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_quota_data *qd; unsigned int x; int error = 0; @@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) sort_qd, NULL); for (x = 0; x < al->al_qd_num; x++) { - error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); + int force = NO_FORCE; + qd = al->al_qd[x]; + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force = FORCE; + error = do_glock(qd, force, &al->al_qd_ghs[x]); if (error) break; } @@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, offset = qd2offset(qd); alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; if (alloc_required) { al = gfs2_alloc_get(ip); if (al == NULL) @@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, blocks += gfs2_rg_blocks(al); } - error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); if (error) goto out_release; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7293ea27020..cf930cd9664 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1602,7 +1602,7 @@ rgrp_error: * */ -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); +} +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_data(ip, bstart, blen); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); } @@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) * */ -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); + gfs2_meta_wipe(ip, bstart, blen); +} +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + __gfs2_free_meta(ip, bstart, blen); gfs2_statfs_change(sdp, 0, +blen, 0); gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, bstart, blen); } void gfs2_unlink_di(struct inode *inode) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 50c2bb04369..a80e3034ac4 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); +extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index afa66aaa223..b4d70b13be9 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } /* - * hfs_unlink() + * hfs_remove() * - * This is the unlink() entry in the inode_operations structure for - * regular HFS directories. The purpose is to delete an existing - * file, given the inode for the parent directory and the name - * (and its length) of the existing file. - */ -static int hfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode; - int res; - - inode = dentry->d_inode; - res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); - if (res) - return res; - - drop_nlink(inode); - hfs_delete_inode(inode); - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - - return res; -} - -/* - * hfs_rmdir() + * This serves as both unlink() and rmdir() in the inode_operations + * structure for regular HFS directories. The purpose is to delete + * an existing child, given the inode for the parent directory and + * the name (and its length) of the existing directory. * - * This is the rmdir() entry in the inode_operations structure for - * regular HFS directories. The purpose is to delete an existing - * directory, given the inode for the parent directory and the name - * (and its length) of the existing directory. + * HFS does not have hardlinks, so both rmdir and unlink set the + * link count to 0. The only difference is the emptiness check. */ -static int hfs_rmdir(struct inode *dir, struct dentry *dentry) +static int hfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode; + struct inode *inode = dentry->d_inode; int res; - inode = dentry->d_inode; - if (inode->i_size != 2) + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) @@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - res = hfs_unlink(new_dir, new_dentry); + res = hfs_remove(new_dir, new_dentry); if (res) return res; } @@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = { const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, - .unlink = hfs_unlink, + .unlink = hfs_remove, .mkdir = hfs_mkdir, - .rmdir = hfs_rmdir, + .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, }; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 52a0bcaa7b6..b1991a2a08e 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode) u32 start, len, goal; int res; - if (sbi->total_blocks - sbi->free_blocks + 8 > - sbi->alloc_file->i_size * 8) { + if (sbi->alloc_file->i_size * 8 < + sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ printk(KERN_ERR "hfs: extend alloc file! " "(%llu,%u,%u)\n", diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index d66ad113b1c..40ad88c12c6 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb, res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, data, READ); if (res) - return res; + goto out; switch (be16_to_cpu(*((__be16 *)data))) { case HFS_OLD_PMAP_MAGIC: @@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb, res = -ENOENT; break; } - +out: kfree(data); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9a3b4795f43..b49b55584c8 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) struct inode *root, *inode; struct qstr str; struct nls_table *nls = NULL; - int err = -EINVAL; + int err; + err = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out; sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); hfsplus_fill_defaults(sbi); + + err = -EINVAL; if (!hfsplus_parse_options(data, sbi)) { printk(KERN_ERR "hfs: unable to parse mount options\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } /* temporarily use utf8 to correctly find the hidden dir below */ @@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sbi->nls = load_nls("utf8"); if (!sbi->nls) { printk(KERN_ERR "hfs: unable to load nls for utf8\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); - err = -EINVAL; - goto cleanup; + goto out_unload_nls; } vhdr = sbi->s_vhdr; @@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { printk(KERN_ERR "hfs: wrong filesystem version\n"); - goto cleanup; + goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); @@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { printk(KERN_ERR "hfs: failed to load extents file\n"); - goto cleanup; + goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { printk(KERN_ERR "hfs: failed to load catalog file\n"); - goto cleanup; + goto out_close_ext_tree; } inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { printk(KERN_ERR "hfs: failed to load allocation file\n"); err = PTR_ERR(inode); - goto cleanup; + goto out_close_cat_tree; } sbi->alloc_file = inode; @@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(root)) { printk(KERN_ERR "hfs: failed to load root directory\n"); err = PTR_ERR(root); - goto cleanup; - } - sb->s_d_op = &hfsplus_dentry_operations; - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); - err = -ENOMEM; - goto cleanup; + goto out_put_alloc_file; } str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; @@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) - goto cleanup; + goto out_put_root; inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto cleanup; + goto out_put_root; } sbi->hidden_dir = inode; } else hfs_find_exit(&fd); - if (sb->s_flags & MS_RDONLY) - goto out; + if (!(sb->s_flags & MS_RDONLY)) { + /* + * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused + * all three are registered with Apple for our use + */ + vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); + vhdr->modify_date = hfsp_now2mt(); + be32_add_cpu(&vhdr->write_count, 1); + vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); + vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); + hfsplus_sync_fs(sb, 1); - /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused - * all three are registered with Apple for our use - */ - vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); - vhdr->modify_date = hfsp_now2mt(); - be32_add_cpu(&vhdr->write_count, 1); - vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); - vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); - hfsplus_sync_fs(sb, 1); - - if (!sbi->hidden_dir) { - mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, - &str, sbi->hidden_dir); - mutex_unlock(&sbi->vh_mutex); - - hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); + if (!sbi->hidden_dir) { + mutex_lock(&sbi->vh_mutex); + sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, + sbi->hidden_dir); + mutex_unlock(&sbi->vh_mutex); + + hfsplus_mark_inode_dirty(sbi->hidden_dir, + HFSPLUS_I_CAT_DIRTY); + } } -out: + + sb->s_d_op = &hfsplus_dentry_operations; + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + err = -ENOMEM; + goto out_put_hidden_dir; + } + unload_nls(sbi->nls); sbi->nls = nls; return 0; -cleanup: - hfsplus_put_super(sb); +out_put_hidden_dir: + iput(sbi->hidden_dir); +out_put_root: + iput(sbi->alloc_file); +out_put_alloc_file: + iput(sbi->alloc_file); +out_close_cat_tree: + hfs_btree_close(sbi->cat_tree); +out_close_ext_tree: + hfs_btree_close(sbi->ext_tree); +out_free_vhdr: + kfree(sbi->s_vhdr); + kfree(sbi->s_backup_vhdr); +out_unload_nls: + unload_nls(sbi->nls); unload_nls(nls); + kfree(sbi); +out: return err; } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 196231794f6..3031d81f5f0 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -167,7 +167,7 @@ reread: break; case cpu_to_be16(HFSP_WRAP_MAGIC): if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) - goto out; + goto out_free_backup_vhdr; wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; part_size = wd.embed_count * wd.ablk_size; @@ -179,7 +179,7 @@ reread: * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) - goto out; + goto out_free_backup_vhdr; goto reread; } diff --git a/fs/inode.c b/fs/inode.c index da85e56378f..9910c039f02 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); /* - * iprune_sem provides exclusion between the kswapd or try_to_free_pages - * icache shrinking path, and the umount path. Without this exclusion, - * by the time prune_icache calls iput for the inode whose pages it has - * been invalidating, or by the time it calls clear_inode & destroy_inode - * from its final dispose_list, the struct super_block they refer to - * (for inode->i_sb->s_op) may already have been freed and reused. + * iprune_sem provides exclusion between the icache shrinking and the + * umount path. * - * We make this an rwsem because the fastpath is icache shrinking. In - * some cases a filesystem may be doing a significant amount of work in - * its inode reclaim code, so this should improve parallelism. + * We don't actually need it to protect anything in the umount path, + * but only need to cycle through it to make sure any inode that + * prune_icache took off the LRU list has been fully torn down by the + * time we are past evict_inodes. */ static DECLARE_RWSEM(iprune_sem); @@ -295,6 +292,20 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -308,13 +319,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); @@ -508,17 +513,12 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - WARN_ON(1); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; - } inode->i_state |= I_FREEING; @@ -534,28 +534,40 @@ void evict_inodes(struct super_block *sb) spin_unlock(&inode_lock); dispose_list(&dispose); + + /* + * Cycle through iprune_sem to make sure any inode that prune_icache + * moved off the list before we took the lock has been fully torn + * down. + */ + down_write(&iprune_sem); up_write(&iprune_sem); } /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes * * Attempts to free all inodes for a given superblock. If there were any * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. */ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes(struct super_block *sb, bool kill_dirty) { int busy = 0; struct inode *inode, *next; LIST_HEAD(dispose); - down_write(&iprune_sem); - spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; + if (inode->i_state & I_DIRTY && !kill_dirty) { + busy = 1; + continue; + } if (atomic_read(&inode->i_count)) { busy = 1; continue; @@ -575,7 +587,6 @@ int invalidate_inodes(struct super_block *sb) spin_unlock(&inode_lock); dispose_list(&dispose); - up_write(&iprune_sem); return busy; } diff --git a/fs/internal.h b/fs/internal.h index 0663568b124..f3d15de44b1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,10 +106,23 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); /* * inode.c */ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/ioctl.c b/fs/ioctl.c index a59635e295f..1eebeb72b20 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode, len = isize; } + /* + * Some filesystems can't deal with being asked to map less than + * blocksize, so make sure our len is at least block length. + */ + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index ed752cb3847..dd4687ff30d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ - - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; return 255; + } len = 3; fh32[0] = ei->i_iget5_block; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9e4686900f1..97e73469b2c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction commit was started. + * Called with j_state_lock locked for writing. + * Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal) { transaction_t *transaction = NULL; tid_t tid; + int need_to_start = 0; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; - __jbd2_log_start_commit(journal, transaction->t_tid); + if (!tid_geq(journal->j_commit_request, transaction->t_tid)) + need_to_start = 1; } else if (journal->j_committing_transaction) transaction = journal->j_committing_transaction; @@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) tid = transaction->t_tid; read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); jbd2_log_wait_commit(journal, tid); return 1; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index faad2bd787c..1d1191050f9 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction) static int start_this_handle(journal_t *journal, handle_t *handle, int gfp_mask) { - transaction_t *transaction; - int needed; - int nblocks = handle->h_buffer_credits; - transaction_t *new_transaction = NULL; + transaction_t *transaction, *new_transaction = NULL; + tid_t tid; + int needed, need_to_start; + int nblocks = handle->h_buffer_credits; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -222,8 +222,11 @@ repeat: atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - __jbd2_log_start_commit(journal, transaction->t_tid); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int ret; + tid_t tid; + int need_to_start, ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ @@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "restarting handle %p\n", handle); - __jbd2_log_start_commit(journal, transaction->t_tid); + tid = transaction->t_tid; + need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 92978658ed1..82faddd1f32 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, no chance of AB-BA deadlock involving its f->sem). */ mutex_unlock(&f->sem); - ret = jffs2_do_create(c, dir_f, f, ri, - dentry->d_name.name, dentry->d_name.len); + ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name); if (ret) goto fail; @@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; @@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; @@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de jffs2_complete_reservation(c); - ret = jffs2_init_security(inode, dir_i); + ret = jffs2_init_security(inode, dir_i, &dentry->d_name); if (ret) goto fail; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 5a53d9bdb2b..e4619b00f7c 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, unsigned char *buf, uint32_t offset, uint32_t writelen, uint32_t *retlen); int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, - struct jffs2_raw_inode *ri, const char *name, int namelen); + struct jffs2_raw_inode *ri, const struct qstr *qstr); int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, int namelen, struct jffs2_inode_info *dead_f, uint32_t time); int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 239f51216a6..cfeb7164b08 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -23,14 +23,15 @@ #include "nodelist.h" /* ---- Initial Security Label Attachment -------------- */ -int jffs2_init_security(struct inode *inode, struct inode *dir) +int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int rc; size_t len; void *value; char *name; - rc = security_inode_init_security(inode, dir, &name, &value, &len); + rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); if (rc) { if (rc == -EOPNOTSUPP) return 0; diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index c819eb0e982..30d175b6d29 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, return ret; } -int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) +int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, + struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, + const struct qstr *qstr) { struct jffs2_raw_dirent *rd; struct jffs2_full_dnode *fn; @@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str mutex_unlock(&f->sem); jffs2_complete_reservation(c); - ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr); if (ret) return ret; ret = jffs2_init_acl_post(&f->vfs_inode); if (ret) return ret; - ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, - ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); + ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen, + ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len)); if (ret) { /* Eep. */ @@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); - rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); + rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len); rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); rd->pino = cpu_to_je32(dir_f->inocache->ino); rd->version = cpu_to_je32(++dir_f->highest_version); rd->ino = ri->ino; rd->mctime = ri->ctime; - rd->nsize = namelen; + rd->nsize = qstr->len; rd->type = DT_REG; rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); - rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); + rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len)); - fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); + fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL); jffs2_free_raw_dirent(rd); diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index cf4f5759b42..7be4beb306f 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #endif /* CONFIG_JFFS2_FS_XATTR */ #ifdef CONFIG_JFFS2_FS_SECURITY -extern int jffs2_init_security(struct inode *inode, struct inode *dir); +extern int jffs2_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); extern const struct xattr_handler jffs2_security_xattr_handler; #else -#define jffs2_init_security(inode,dir) (0) +#define jffs2_init_security(inode,dir,qstr) (0) #endif /* CONFIG_JFFS2_FS_SECURITY */ #endif /* _JFFS2_FS_XATTR_H_ */ diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index 88b6cc535bf..e9e100fd7c0 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); #ifdef CONFIG_JFS_SECURITY -extern int jfs_init_security(tid_t, struct inode *, struct inode *); +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); #else static inline int jfs_init_security(tid_t tid, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 81ead850ddb..eaaf2b511e8 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, if (rc) goto out3; - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) if (rc) goto out3; - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == JFS_LINK_MAX) return -EMLINK; - if (ip->i_nlink == 0) - return -ENOENT; - dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); - rc = jfs_init_security(tid, ip, dip); + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) goto out3; @@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (rc) goto out3; - rc = jfs_init_security(tid, ip, dir); + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; @@ -1600,7 +1597,7 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* * This is not negative dentry. Always valid. diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 2d7f165d0f1..3fa4c32272d 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name) } #ifdef CONFIG_JFS_SECURITY -int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int rc; size_t len; @@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) char *suffix; char *name; - rc = security_inode_init_security(inode, dir, &suffix, &value, &len); + rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); if (rc) { if (rc == -EOPNOTSUPP) return 0; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index ce7337ddfdb..6e6777f1b4b 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, new_de = minix_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); minix_set_link(new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= info->s_link_max) goto out_dir; } - inode_inc_link_count(old_inode); err = minix_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } minix_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { minix_set_link(dir_de, dir_page, new_dir); diff --git a/fs/namei.c b/fs/namei.c index 7d77f24d32a..b912b7abe74 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -char * getname(const char __user * filename) +static char *getname_flags(const char __user * filename, int flags) { char *tmp, *result; @@ -147,14 +147,21 @@ char * getname(const char __user * filename) result = tmp; if (retval < 0) { - __putname(tmp); - result = ERR_PTR(retval); + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); return result; } +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0); +} + #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { @@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; struct dentry *dentry = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) goto err; BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) err: spin_unlock(&dentry->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -454,17 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - - /* - * It can be possible to revalidate the dentry that we started - * the path walk with. force_reval_path may also revalidate the - * dentry already committed to the nameidata. - */ - if (unlikely(parent == dentry)) - return nameidata_drop_rcu(nd); + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -484,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry parent->d_count++; spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -498,7 +501,7 @@ err: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -506,8 +509,16 @@ err_root: /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) { - if (nd->flags & LOOKUP_RCU) - return nameidata_dentry_drop_rcu(nd, dentry); + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } return 0; } @@ -526,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; spin_lock(&dentry->d_lock); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_unlock; @@ -547,53 +559,31 @@ err_unlock: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) -{ - if (likely(nd->flags & LOOKUP_RCU)) - return nameidata_drop_rcu_last(nd); - return 0; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata */ void release_open_intent(struct nameidata *nd) { - if (nd->intent.open.file->f_path.dentry == NULL) - put_filp(nd->intent.open.file); - else - fput(nd->intent.open.file); -} + struct file *file = nd->intent.open.file; -/* - * Call d_revalidate and handle filesystems that request rcu-walk - * to be dropped. This may be called and return in rcu-walk mode, - * regardless of success or error. If -ECHILD is returned, the caller - * must return -ECHILD back up the path walk stack so path walk may - * be restarted in ref-walk mode. - */ -static int d_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - int status; - - status = dentry->d_op->d_revalidate(dentry, nd); - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return status; - status = dentry->d_op->d_revalidate(dentry, nd); + if (file && !IS_ERR(file)) { + if (file->f_path.dentry == NULL) + put_filp(file); + else + fput(file); } +} - return status; +static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + return dentry->d_op->d_revalidate(dentry, nd); } -static inline struct dentry * +static struct dentry * do_revalidate(struct dentry *dentry, struct nameidata *nd) { - int status; - - status = d_revalidate(dentry, nd); + int status = d_revalidate(dentry, nd); if (unlikely(status <= 0)) { /* * The dentry failed validation. @@ -602,37 +592,18 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) * to return a fail status. */ if (status < 0) { - /* If we're in rcu-walk, we don't have a ref */ - if (!(nd->flags & LOOKUP_RCU)) - dput(dentry); + dput(dentry); dentry = ERR_PTR(status); - - } else { - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu_maybe(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; } } return dentry; } -static inline int need_reval_dot(struct dentry *dentry) -{ - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - return 1; -} - /* - * force_reval_path - force revalidation of a dentry + * handle_reval_path - force revalidation of a dentry * * In some situations the path walking code will trust dentries without * revalidating them. This causes problems for filesystems that depend on @@ -646,30 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry) * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary. */ -static int -force_reval_path(struct path *path, struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { + struct dentry *dentry = nd->path.dentry; int status; - struct dentry *dentry = path->dentry; - /* - * only check on filesystems where it's possible for the dentry to - * become stale. - */ - if (!need_reval_dot(dentry)) + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) return 0; + /* Note: we do not d_invalidate() */ status = d_revalidate(dentry, nd); if (status > 0) return 0; - if (!status) { - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_drop_rcu(nd)) - return -ECHILD; - d_invalidate(dentry); + if (!status) status = -ESTALE; - } + return status; } @@ -738,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; } nd->inode = nd->path.dentry->d_inode; @@ -767,18 +737,43 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + static __always_inline int -__do_follow_link(const struct path *link, struct nameidata *nd, void **p) +follow_link(struct path *link, struct nameidata *nd, void **p) { int error; struct dentry *dentry = link->dentry; - touch_atime(link->mnt, dentry); - nd_set_link(nd, NULL); + BUG_ON(nd->flags & LOOKUP_RCU); if (link->mnt == nd->path.mnt) mntget(link->mnt); + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + + touch_atime(link->mnt, dentry); + nd_set_link(nd, NULL); + + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -788,50 +783,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) if (s) error = __vfs_follow_link(nd, s); else if (nd->last_type == LAST_BIND) { - error = force_reval_path(&nd->path, nd); - if (error) + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ path_put(&nd->path); + error = -ELOOP; + } } } return error; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int do_follow_link(struct path *path, struct nameidata *nd) -{ - void *cookie; - int err = -ELOOP; - if (current->link_count >= MAX_NESTED_LINKS) - goto loop; - if (current->total_link_count >= 40) - goto loop; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - cond_resched(); - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; - current->link_count++; - current->total_link_count++; - nd->depth++; - err = __do_follow_link(path, nd, &cookie); - if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) - path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); - path_put(path); - current->link_count--; - nd->depth--; - return err; -loop: - path_put_conditional(path, nd); - path_put(&nd->path); - return err; -} - static int follow_up_rcu(struct path *path) { struct vfsmount *parent; @@ -1070,7 +1033,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) - return -ECHILD; + goto failed; inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; @@ -1083,8 +1046,15 @@ static int follow_dotdot_rcu(struct nameidata *nd) } __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; - return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; } /* @@ -1218,57 +1188,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - struct inode *dir; + int need_reval = 1; + int status = 1; int err; /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, name); - if (err < 0) - return err; - } - - /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; dentry = __d_lookup_rcu(parent, name, &seq, inode); - if (!dentry) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto need_lookup; - } + if (!dentry) + goto unlazy; + /* Memory barrier in read_seqcount_begin of child is enough */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; -done2: + + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } + } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; - if (nameidata_drop_rcu(nd)) - return -ECHILD; - /* fallthru */ +unlazy: + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + } else { + dentry = __d_lookup(parent, name); } - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; -found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; -done: + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); + } + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1278,49 +1276,113 @@ done: } *inode = path->dentry->d_inode; return 0; +} -need_lookup: - dir = parent->d_inode; - BUG_ON(nd->inode != dir); +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore, or the first - * lookup failed due to an unrelated rename. - * - * This could use version numbering or similar to avoid unnecessary - * cache lookups, but then we'd have to do the first lookup in the - * non-racy way. However in the common case here, everything should - * be hot in cache, so would it be a big win? - */ - dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - mutex_unlock(&dir->i_mutex); - if (IS_ERR(dentry)) - goto fail; - goto done; +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. */ - mutex_unlock(&dir->i_mutex); - goto found; + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} -need_revalidate: - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (nd->flags & LOOKUP_RCU) - goto done2; - goto done; +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; -fail: - return PTR_ERR(dentry); + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; } /* @@ -1340,30 +1402,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_reval; - - if (nd->depth) - lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); + return 0; /* At this point we know we have a real path component. */ for(;;) { - struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; + int type; nd->flags |= LOOKUP_CONTINUE; - if (nd->flags & LOOKUP_RCU) { - err = exec_permission(nd->inode, IPERM_FLAG_RCU); - if (err == -ECHILD) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto exec_again; - } - } else { -exec_again: - err = exec_permission(nd->inode, 0); - } + + err = may_lookup(nd); if (err) break; @@ -1379,56 +1429,43 @@ exec_again: this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + /* remove trailing slashes? */ if (!c) goto last_component; while (*++name == '/'); if (!*name) - goto last_with_slashes; + goto last_component; - /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. - */ - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - continue; - } - /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - err = -ENOENT; - if (!inode) - goto out_dput; + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; - if (inode->i_op->follow_link) { - /* We commonly drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) - return -ECHILD; - BUG_ON(inode != next.dentry->d_inode); - err = do_follow_link(&next, nd); + if (err) { + err = nested_symlink(&next, nd); if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - err = -ENOENT; - if (!nd->inode) - break; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; + return err; } err = -ENOTDIR; if (!nd->inode->i_op->lookup) @@ -1436,209 +1473,109 @@ exec_again: continue; /* here ends the main loop */ -last_with_slashes: - lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) - goto lookup_parent; - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - goto return_reval; - } - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - if (inode && unlikely(inode->i_op->follow_link) && - (lookup_flags & LOOKUP_FOLLOW)) { - if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) - return -ECHILD; - BUG_ON(inode != next.dentry->d_inode); - err = do_follow_link(&next, nd); - if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; - } - err = -ENOENT; - if (!nd->inode) - break; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - } - goto return_base; -lookup_parent: nd->last = this; - nd->last_type = LAST_NORM; - if (this.name[0] != '.') - goto return_base; - if (this.len == 1) - nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') - nd->last_type = LAST_DOTDOT; - else - goto return_base; -return_reval: - /* - * We bypassed the ordinary revalidation routines. - * We may need to check the cached dentry for staleness. - */ - if (need_reval_dot(nd->path.dentry)) { - /* Note: we do not d_invalidate() */ - err = d_revalidate(nd->path.dentry, nd); - if (!err) - err = -ESTALE; - if (err < 0) - break; - } -return_base: - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; + nd->last_type = type; return 0; -out_dput: - if (!(nd->flags & LOOKUP_RCU)) - path_put_conditional(&next, nd); - break; } - if (!(nd->flags & LOOKUP_RCU)) - path_put(&nd->path); -return_err: + terminate_walk(nd); return err; } -static inline int path_walk_rcu(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static inline int path_walk_simple(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static int path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - current->total_link_count = 0; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - -static void path_finish_rcu(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. */ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (nd->file) - fput(nd->file); -} - -static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) { int retval = 0; int fput_needed; struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_RCU; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + nd->root.mnt = NULL; - nd->file = NULL; if (*name=='/') { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->path = nd->root; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; } else if (dfd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } } else { struct dentry *dentry; - file = fget_light(dfd, &fput_needed); + file = fget_raw_light(dfd, &fput_needed); retval = -EBADF; if (!file) goto out_fail; dentry = file->f_path.dentry; - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } nd->path = file->f_path; - if (fput_needed) - nd->file = file; - - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } } + nd->inode = nd->path.dentry->d_inode; return 0; @@ -1648,60 +1585,23 @@ out_fail: return retval; } -static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static inline int lookup_last(struct nameidata *nd, struct path *path) { - int retval = 0; - int fput_needed; - struct file *file; - - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; - nd->depth = 0; - nd->root.mnt = NULL; + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - if (*name=='/') { - set_root(nd); - nd->path = nd->root; - path_get(&nd->root); - } else if (dfd == AT_FDCWD) { - get_fs_pwd(current->fs, &nd->path); - } else { - struct dentry *dentry; - - file = fget_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; - - dentry = file->f_path.dentry; - - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; - - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; - - nd->path = file->f_path; - path_get(&file->f_path); - - fput_light(file, fput_needed); - } - nd->inode = nd->path.dentry->d_inode; - return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, +static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; + struct file *base = NULL; + struct path path; + int err; /* * Path walking is largely split up into 2 different synchronisation @@ -1717,44 +1617,75 @@ static int do_path_lookup(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init_rcu(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk_rcu(name, nd); - path_finish_rcu(nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } } - if (unlikely(retval == -ECHILD || retval == -ESTALE)) { - /* slower, locked walk */ - if (retval == -ESTALE) - flags |= LOOKUP_REVAL; - retval = path_init(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk(name, nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) + err = handle_reval_path(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + return -ENOTDIR; } } + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + if (likely(!retval)) { if (unlikely(!audit_dummy_context())) { if (nd->path.dentry && nd->inode) audit_inode(name, nd->path.dentry); } } - return retval; } -int path_lookup(const char *name, unsigned int flags, - struct nameidata *nd) +int kern_path_parent(const char *name, struct nameidata *nd) { - return do_path_lookup(AT_FDCWD, name, flags, nd); + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -1778,29 +1709,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; - - /* same as do_path_lookup */ - nd->last_type = LAST_ROOT; - nd->flags = flags; - nd->depth = 0; - - nd->path.dentry = dentry; - nd->path.mnt = mnt; - path_get(&nd->path); - nd->root = nd->path; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->inode)) - audit_inode(name, nd->path.dentry); - - path_put(&nd->root); - nd->root.mnt = NULL; - - return retval; + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); } static struct dentry *__lookup_hash(struct qstr *name, @@ -1815,17 +1727,6 @@ static struct dentry *__lookup_hash(struct qstr *name, return ERR_PTR(err); /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - err = base->d_op->d_hash(base, inode, name); - dentry = ERR_PTR(err); - if (err < 0) - goto out; - } - - /* * Don't bother with __d_lookup: callers are for creat as * well as unlink, so a lot of the time it would cost * a double lookup. @@ -1837,7 +1738,7 @@ static struct dentry *__lookup_hash(struct qstr *name, if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); -out: + return dentry; } @@ -1851,28 +1752,6 @@ static struct dentry *lookup_hash(struct nameidata *nd) return __lookup_hash(&nd->last, nd->path.dentry, nd); } -static int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) -{ - unsigned long hash; - unsigned int c; - - this->name = name; - this->len = len; - if (!len) - return -EACCES; - - hash = init_name_hash(); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return -EACCES; - hash = partial_name_hash(c, hash); - } - this->hash = end_name_hash(hash); - return 0; -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -1886,14 +1765,34 @@ static int __lookup_one_len(const char *name, struct qstr *this, */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { - int err; struct qstr this; + unsigned long hash; + unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); - err = __lookup_one_len(name, &this, base, len); - if (err) - return ERR_PTR(err); + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } return __lookup_hash(&this, base, NULL); } @@ -1902,7 +1801,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct nameidata nd; - char *tmp = getname(name); + char *tmp = getname_flags(name, flags); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -2082,12 +1981,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; + /* O_PATH? */ + if (!acc_mode) + return 0; + if (!inode) return -ENOENT; @@ -2156,34 +2059,6 @@ static int handle_truncate(struct file *filp) } /* - * Be careful about ever adding any more callers of this - * function. Its flags must be in the namei format, not - * what get passed to sys_open(). - */ -static int __open_namei_create(struct nameidata *nd, struct path *path, - int open_flag, int mode) -{ - int error; - struct dentry *dir = nd->path.dentry; - - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = path->dentry; - - if (error) - return error; - /* Don't check for write permission, don't truncate */ - return may_open(&nd->path, 0, open_flag & ~O_TRUNC); -} - -/* * Note that while the flag value (low two bits) for sys_open means: * 00 - read-only * 01 - write-only @@ -2207,128 +2082,115 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_truncate(int flag, struct inode *inode) -{ - /* - * We'll never write to the fs underlying - * a device file. - */ - if (special_file(inode->i_mode)) - return 0; - return (flag & O_TRUNC); -} - -static struct file *finish_open(struct nameidata *nd, - int open_flag, int acc_mode) -{ - struct file *filp; - int will_truncate; - int error; - - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit; - } - error = may_open(&nd->path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - return filp; - -exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); - path_put(&nd->path); - return ERR_PTR(error); -} - /* - * Handle O_CREAT case for do_filp_open + * Handle the last step of open() */ static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int acc_mode, - int mode, const char *pathname) + const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; struct file *filp; - int error = -EISDIR; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; switch (nd->last_type) { case LAST_DOTDOT: - follow_dotdot(nd); - dir = nd->path.dentry; case LAST_DOT: - if (need_reval_dot(dir)) { - int status = d_revalidate(nd->path.dentry, nd); - if (!status) - status = -ESTALE; - if (status < 0) { - error = status; - goto exit; - } - } + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - goto exit; + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); + if (error) + goto exit; + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; case LAST_BIND: + /* can't be RCU mode here */ + error = handle_reval_path(nd); + if (error) + goto exit; audit_inode(pathname, dir); goto ok; } + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + audit_inode(pathname, dir); + error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto exit; mutex_lock(&dir->d_inode->i_mutex); - path->dentry = lookup_hash(nd); - path->mnt = nd->path.mnt; - - error = PTR_ERR(path->dentry); - if (IS_ERR(path->dentry)) { + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd->intent.open.file)) { - error = PTR_ERR(nd->intent.open.file); - goto exit_mutex_unlock; - } + path->dentry = dentry; + path->mnt = nd->path.mnt; /* Negative dentry, just create the file */ - if (!path->dentry->d_inode) { + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); /* * This write is needed to ensure that a - * ro->rw transition does not occur between + * rw->ro transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). @@ -2336,22 +2198,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(nd, path, open_flag, mode); - if (error) { - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - return filp; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; } /* @@ -2381,7 +2242,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, open_flag, acc_mode); + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit_mutex_unlock: @@ -2389,199 +2283,103 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); - path_put(&nd->path); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) { + struct file *base = NULL; struct file *filp; - struct nameidata nd; - int error; struct path path; - int count = 0; - int flag = open_to_namei_flags(open_flag); - int flags; - - if (!(open_flag & O_CREAT)) - mode = 0; - - /* Must never be set by userspace */ - open_flag &= ~FMODE_NONOTIFY; - - /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. - */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); - - /* O_TRUNC implies we need access checks for write permissions */ - if (open_flag & O_TRUNC) - acc_mode |= MAY_WRITE; - - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (open_flag & O_APPEND) - acc_mode |= MAY_APPEND; - - flags = LOOKUP_OPEN; - if (open_flag & O_CREAT) { - flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - flags |= LOOKUP_FOLLOW; + int error; filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); - filp->f_flags = open_flag; - nd.intent.open.file = filp; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; - if (open_flag & O_CREAT) - goto creat; - - /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags, &nd); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) { - if (nd.inode->i_op->follow_link) - goto out_path; - } - error = -ENOTDIR; - if (nd.flags & LOOKUP_DIRECTORY) { - if (!nd.inode->i_op->lookup) - goto out_path; - } - audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, open_flag, acc_mode); - return filp; -creat: - /* OK, have to create the file. Find the parent. */ - error = path_init_rcu(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - error = path_walk_rcu(pathname, &nd); - path_finish_rcu(&nd); - if (unlikely(error == -ECHILD || error == -ESTALE)) { - /* slower, locked walk */ - if (error == -ESTALE) { -reval: - flags |= LOOKUP_REVAL; - } - error = path_init(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - - error = path_walk_simple(pathname, &nd); - } + current->total_link_count = 0; + error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. - */ - nd.flags = flags; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; - struct inode *linki = link.dentry->d_inode; void *cookie; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) - goto exit_dput; - if (count++ == 32) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do - * the thing by hands. The reason is that this way we have zero - * link_count and path_walk() (called from ->follow_link) - * honoring LOOKUP_PARENT. After that we have the parent and - * last component, i.e. we are in the same situation as after - * the first path_walk(). Well, almost - if the last component - * is normal we get its copy stored in nd->last.name and we will - * have to putname() it when we are done. Procfs-like symlinks - * just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(link.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&link, &nd, &cookie); - if (unlikely(error)) { - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - /* nd.path had been dropped */ - nd.path = link; - goto out_path; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - path_put(&link); + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); } out: - if (nd.root.mnt) - path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) - goto reval; + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); return filp; -exit_dput: - path_put_conditional(&path, &nd); -out_path: - path_put(&nd.path); out_filp: - if (!IS_ERR(nd.intent.open.file)) - release_open_intent(&nd); filp = ERR_PTR(error); goto out; } -/** - * filp_open - open file and return file pointer - * - * @filename: path to open - * @flags: open flags as per the open(2) second argument - * @mode: mode for the new file if O_CREAT is set, else ignored - * - * This is the helper to open a file from kernelspace if you really - * have to. But in generally you should not do this, so please move - * along, nothing to see here.. - */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) { - return do_filp_open(AT_FDCWD, filename, flags, mode, 0); + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *file; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; } -EXPORT_SYMBOL(filp_open); /** * lookup_create - lookup a dentry, creating it if it doesn't exist @@ -3120,7 +2918,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - error = dir->i_op->link(old_dentry, dir, new_dentry); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); @@ -3142,15 +2944,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, struct dentry *new_dentry; struct nameidata nd; struct path old_path; + int how = 0; int error; char *to; - if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; - error = user_path_at(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_path); + error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; @@ -3587,7 +3401,7 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); diff --git a/fs/namespace.c b/fs/namespace.c index 7b0b9537169..e96e03782de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int uuid_is_nil(u8 *uuid) +{ + int i; + u8 *cp = (u8 *)uuid; + + for (i = 0; i < 16; i++) { + if (*cp++) + return 0; + } + return 1; +} + static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); + if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) + /* print the uuid */ + seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); + /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); @@ -1244,7 +1260,7 @@ static int do_umount(struct vfsmount *mnt, int flags) */ br_write_lock(vfsmount_lock); if (mnt_get_count(mnt) != 2) { - br_write_lock(vfsmount_lock); + br_write_unlock(vfsmount_lock); return -EBUSY; } br_write_unlock(vfsmount_lock); @@ -1767,6 +1783,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + err = security_sb_remount(sb, data); + if (err) + return err; + down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1cc600e77bb..01768e5e2c9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -37,6 +37,7 @@ #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word) */ u64 nfs_compat_user_ino64(u64 fileid) { - int ino; +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif if (enable_ino64) return fileid; @@ -1513,7 +1518,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7a747407314..1be36cf65bf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); @@ -307,10 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); -extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); -extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f5c9b125e8c..b73c34375f6 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf = kmalloc(rlen + 1, GFP_KERNEL); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } buf[rlen] = '\0'; memcpy(buf, r_addr, rlen); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 78936a8f40a..0a07e353a96 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -256,12 +256,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -272,7 +273,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -295,8 +296,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } /* We failed to handle the error */ return nfs4_map_errors(ret); -do_state_recovery: - nfs4_schedule_state_recovery(clp); +wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); if (ret == 0) exception->retry = 1; @@ -435,8 +435,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * clp = res->sr_session->clp; do_renew_lease(clp, timestamp); /* Check sequence flags */ - if (atomic_read(&clp->cl_count) > 1) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); break; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -1255,14 +1255,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery( - server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_lease_recovery(server->nfs_client); goto out; case -ERESTARTSYS: /* @@ -1271,7 +1270,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* * User RPCSEC_GSS context has expired. @@ -1587,7 +1586,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); ret = -EIO; } return ret; @@ -3178,7 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); return; } do_renew_lease(clp, timestamp); @@ -3252,6 +3251,35 @@ static void buf_to_pages(const void *buf, size_t buflen, } } +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + struct nfs4_cached_acl { int cached; size_t len; @@ -3420,13 +3448,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .rpc_argp = &arg, .rpc_resp = &res, }; - int ret; + int ret, i; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; nfs_inode_return_delegation(inode); - buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + put_page(pages[i-1]); + /* * Acl update can result in inode attribute update. * so mark the attribute cache invalid. @@ -3464,12 +3502,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3480,7 +3519,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3497,9 +3536,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -do_state_recovery: +wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; @@ -4110,7 +4148,7 @@ static void nfs4_lock_release(void *calldata) task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); @@ -4134,23 +4172,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = { static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_state *state = lsp->ls_state; - switch (error) { case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_nograce(clp, state); - lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: - if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_reboot(clp, state); lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -4366,12 +4399,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -ERESTARTSYS: /* @@ -4381,7 +4416,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; case -EKEYEXPIRED: @@ -4988,10 +5023,20 @@ int nfs4_proc_create_session(struct nfs_client *clp) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + long timeout = 0; + int err; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - status = _nfs4_proc_create_session(clp); + do { + status = _nfs4_proc_create_session(clp); + if (status == -NFS4ERR_DELAY) { + err = nfs4_delay(clp->cl_rpcclient, &timeout); + if (err) + status = err; + } + } while (status == -NFS4ERR_DELAY); + if (status) goto out; @@ -5100,7 +5145,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5187,7 +5232,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if (IS_ERR(task)) ret = PTR_ERR(task); else - rpc_put_task(task); + rpc_put_task_async(task); dprintk("<-- %s status=%d\n", __func__, ret); return ret; } @@ -5203,8 +5248,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ret = task->tk_status; + } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -5241,7 +5291,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5309,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) status = PTR_ERR(task); goto out; } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; rpc_put_task(task); return 0; out: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e6742b57a04..0592288f9f0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) } /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp) { if (!clp) return; @@ -1018,7 +1018,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st return 1; } -int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s return 1; } +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; @@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + nfs4_schedule_lease_recovery(session->clp); +} + void nfs41_handle_recall_slot(struct nfs_client *clp) { set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } static void nfs4_reset_all_state(struct nfs_client *clp) @@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { clp->cl_boot_time = CURRENT_TIME; nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) { nfs_expire_all_delegations(clp); if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e2c168b6ee..94d50e86a12 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr, p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); *p++ = cpu_to_be32(OP_CREATE_SESSION); - p = xdr_encode_hyper(p, clp->cl_ex_clid); + p = xdr_encode_hyper(p, clp->cl_clientid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ @@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - xdr_decode_hyper(p, &clp->cl_ex_clid); + xdr_decode_hyper(p, &clp->cl_clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 903908a2002..c541093a5bf 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,11 +86,14 @@ /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "udp" + /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; /* Text-based mount options passed to super.c */ -static char nfs_root_options[256] __initdata = ""; +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; /* Address of NFS server */ static __be32 servaddr __initdata = htonl(INADDR_NONE); @@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src, } static int __init root_nfs_cat(char *dest, const char *src, - const size_t destlen) + const size_t destlen) { + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + if (strlcat(dest, src, destlen) > destlen) return -1; return 0; @@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, if (root_nfs_cat(nfs_root_options, incoming, sizeof(nfs_root_options))) return -1; - - /* - * Possibly prepare for more options to be appended - */ - if (nfs_root_options[0] != '\0' && - nfs_root_options[strlen(nfs_root_options)] != ',') - if (root_nfs_cat(nfs_root_options, ",", - sizeof(nfs_root_options))) - return -1; - return 0; } @@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, */ static int __init root_nfs_data(char *cmdline) { - char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; int len, retval = -1; char *tmp = NULL; const size_t tmplen = sizeof(nfs_export_path); @@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline) * Append mandatory options for nfsroot so they override * what has come before */ - snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", &servaddr); - if (root_nfs_cat(nfs_root_options, addr_option, + if (root_nfs_cat(nfs_root_options, mand_options, sizeof(nfs_root_options))) goto out_optionstoolong; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e313a51acdd..6481d537d69 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); return 1; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c8278f4046c..42b92d7a9cc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } diff --git a/fs/nfsctl.c b/fs/nfsctl.c index bf9cbd242dd..124e8fcb0dd 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -22,30 +22,17 @@ static struct file *do_open(char *name, int flags) { - struct nameidata nd; struct vfsmount *mnt; - int error; + struct file *file; mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); if (IS_ERR(mnt)) return (struct file *)mnt; - error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); - mntput(mnt); /* drop do_kern_mount reference */ - if (error) - return ERR_PTR(error); - - if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); - else - error = may_open(&nd.path, MAY_WRITE, flags); + file = file_open_root(mnt->mnt_root, mnt, name, flags); - if (!error) - return dentry_open(nd.path.dentry, nd.path.mnt, flags, - current_cred()); - - path_put(&nd.path); - return ERR_PTR(error); + mntput(mnt); /* drop do_kern_mount reference */ + return file; } static struct { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3be975e1891..02eb4edf0ec 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); @@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, out: return status; out_default: - return nfs_cb_stat_to_errno(status); + return nfs_cb_stat_to_errno(nfserr); } /* @@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) goto out; if (unlikely(nfserr != NFS4_OK)) - goto out_default; + status = nfs_cb_stat_to_errno(nfserr); out: return status; -out_default: - return nfs_cb_stat_to_errno(status); } /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d98d0213285..7b566ec14e1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; - dp->dl_vfs_file = find_readable_file(fp); - get_file(dp->dl_vfs_file); - dp->dl_flock = NULL; dp->dl_type = type; dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; @@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); - list_add(&dp->dl_perfile, &fp->fi_delegations); - list_add(&dp->dl_perclnt, &clp->cl_delegations); INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); return dp; } @@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp) if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); - fput(dp->dl_vfs_file); kmem_cache_free(deleg_slab, dp); num_delegations--; } } -/* Remove the associated file_lock first, then remove the delegation. - * lease_modify() is called to remove the FS_LEASE file_lock from - * the i_flock list, eventually calling nfsd's lock_manager - * fl_release_callback. - */ -static void -nfs4_close_delegation(struct nfs4_delegation *dp) +static void nfs4_put_deleg_lease(struct nfs4_file *fp) { - dprintk("NFSD: close_delegation dp %p\n",dp); - /* XXX: do we even need this check?: */ - if (dp->dl_flock) - vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); + if (atomic_dec_and_test(&fp->fi_delegees)) { + vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); + fp->fi_lease = NULL; + fp->fi_deleg_file = NULL; + } } /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { - list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); + list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); - nfs4_close_delegation(dp); + nfs4_put_deleg_lease(dp->dl_file); nfs4_put_delegation(dp); } @@ -958,8 +947,6 @@ expire_client(struct nfs4_client *clp) spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - dprintk("NFSD: expire client. dp %p, fp %p\n", dp, - dp->dl_flock); list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } @@ -2078,6 +2065,7 @@ alloc_init_file(struct inode *ino) fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; + fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); @@ -2329,23 +2317,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) nfs4_file_put_access(fp, O_RDONLY); } -/* - * Spawn a thread to perform a recall on the delegation represented - * by the lease (file_lock) - * - * Called from break_lease() with lock_flocks() held. - * Note: we assume break_lease will only call this *once* for any given - * lease. - */ -static -void nfsd_break_deleg_cb(struct file_lock *fl) +static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; - - dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl); - if (!dp) - return; - /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2353,22 +2326,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl) * it's safe to take a reference: */ atomic_inc(&dp->dl_count); - spin_lock(&recall_lock); list_add_tail(&dp->dl_recall_lru, &del_recall_lru); - spin_unlock(&recall_lock); /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); + nfsd4_cb_recall(dp); +} + +/* Called from break_lease() with lock_flocks() held. */ +static void nfsd_break_deleg_cb(struct file_lock *fl) +{ + struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; + struct nfs4_delegation *dp; + + BUG_ON(!fp); + /* We assume break_lease is only called once per lease: */ + BUG_ON(fp->fi_had_conflict); /* * We don't want the locks code to timeout the lease for us; - * we'll remove it ourself if the delegation isn't returned - * in time. + * we'll remove it ourself if a delegation isn't returned + * in time: */ fl->fl_break_time = 0; - dp->dl_file->fi_had_conflict = true; - nfsd4_cb_recall(dp); + spin_lock(&recall_lock); + fp->fi_had_conflict = true; + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + nfsd_break_one_deleg(dp); + spin_unlock(&recall_lock); } static @@ -2461,10 +2447,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) { struct nfs4_delegation *dp; - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) + spin_lock(&recall_lock); + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); return dp; - } + } + spin_unlock(&recall_lock); return NULL; } @@ -2641,6 +2630,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + return NULL; + locks_init_lock(fl); + fl->fl_lmops = &nfsd_lease_mng_ops; + fl->fl_flags = FL_LEASE; + fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = (fl_owner_t)(dp->dl_file); + fl->fl_pid = current->tgid; + return fl; +} + +static int nfs4_setlease(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + struct file_lock *fl; + int status; + + fl = nfs4_alloc_init_lease(dp, flag); + if (!fl) + return -ENOMEM; + fl->fl_file = find_readable_file(fp); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); + if (status) { + list_del_init(&dp->dl_perclnt); + locks_free_lock(fl); + return -ENOMEM; + } + fp->fi_lease = fl; + fp->fi_deleg_file = fl->fl_file; + get_file(fp->fi_deleg_file); + atomic_set(&fp->fi_delegees, 1); + list_add(&dp->dl_perfile, &fp->fi_delegations); + return 0; +} + +static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) +{ + struct nfs4_file *fp = dp->dl_file; + + if (!fp->fi_lease) + return nfs4_setlease(dp, flag); + spin_lock(&recall_lock); + if (fp->fi_had_conflict) { + spin_unlock(&recall_lock); + return -EAGAIN; + } + atomic_inc(&fp->fi_delegees); + list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_unlock(&recall_lock); + list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); + return 0; +} + /* * Attempt to hand out a delegation. */ @@ -2650,7 +2699,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; int cb_up; - struct file_lock *fl; int status, flag = 0; cb_up = nfsd4_cb_channel_good(sop->so_client); @@ -2681,36 +2729,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta } dp = alloc_init_deleg(sop->so_client, stp, fh, flag); - if (dp == NULL) { - flag = NFS4_OPEN_DELEGATE_NONE; - goto out; - } - status = -ENOMEM; - fl = locks_alloc_lock(); - if (!fl) - goto out; - locks_init_lock(fl); - fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_LEASE; - fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_file = find_readable_file(stp->st_file); - BUG_ON(!fl->fl_file); - fl->fl_pid = current->tgid; - dp->dl_flock = fl; - - /* vfs_setlease checks to see if delegation should be handed out. - * the lock_manager callback fl_change is used - */ - if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { - dprintk("NFSD: setlease failed [%d], no delegation\n", status); - dp->dl_flock = NULL; - locks_free_lock(fl); - unhash_delegation(dp); - flag = NFS4_OPEN_DELEGATE_NONE; - goto out; - } + if (dp == NULL) + goto out_no_deleg; + status = nfs4_set_delegation(dp, flag); + if (status) + goto out_free; memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); @@ -2722,6 +2745,12 @@ out: && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) dprintk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; + return; +out_free: + nfs4_put_delegation(dp); +out_no_deleg: + flag = NFS4_OPEN_DELEGATE_NONE; + goto out; } /* @@ -2916,8 +2945,6 @@ nfs4_laundromat(void) test_val = u; break; } - dprintk("NFSD: purging unused delegation dp %p, fp %p\n", - dp, dp->dl_flock); list_move(&dp->dl_recall_lru, &reaplist); } spin_unlock(&recall_lock); @@ -3128,7 +3155,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, goto out; renew_client(dp->dl_client); if (filpp) { - *filpp = find_readable_file(dp->dl_file); + *filpp = dp->dl_file->fi_deleg_file; BUG_ON(!*filpp); } } else { /* open or lock stateid */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 956629b9cdc..615f0a9f060 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { @@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { @@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, u32 dummy; char *machine_name; - int i; + int i, j; int nr_secflavs; READ_BUF(16); @@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); - for (i = 0; i < dummy; ++i) + for (j = 0; j < dummy; ++j) READ32(dummy); break; case RPC_AUTH_GSS: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 3074656ba7b..2d31224b07b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -83,8 +83,6 @@ struct nfs4_delegation { atomic_t dl_count; /* ref count */ struct nfs4_client *dl_client; struct nfs4_file *dl_file; - struct file *dl_vfs_file; - struct file_lock *dl_flock; u32 dl_type; time_t dl_time; /* For recall: */ @@ -379,6 +377,9 @@ struct nfs4_file { */ atomic_t fi_readers; atomic_t fi_writers; + struct file *fi_deleg_file; + struct file_lock *fi_lease; + atomic_t fi_delegees; struct inode *fi_inode; u32 fi_id; /* used with stateowner->so_id * for stateid_hashtbl hash */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 641117f2188..da1d9701f8e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino) if (ra->p_count == 0) frap = rap; } - depth = nfsdstats.ra_size*11/10; + depth = nfsdstats.ra_size; if (!frap) { spin_unlock(&rab->pb_lock); return NULL; @@ -1744,6 +1744,13 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = nfsd_break_lease(odentry->d_inode); if (host_err) goto out_drop_write; + if (ndentry->d_inode) { + host_err = nfsd_break_lease(ndentry->d_inode); + if (host_err) + goto out_drop_write; + } + if (host_err) + goto out_drop_write; host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { host_err = commit_metadata(tfhp); @@ -1812,22 +1819,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); if (host_err) - goto out_nfserr; + goto out_put; host_err = nfsd_break_lease(rdentry->d_inode); if (host_err) - goto out_put; + goto out_drop_write; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); else host_err = vfs_rmdir(dirp, rdentry); -out_put: - dput(rdentry); - if (!host_err) host_err = commit_metadata(fhp); - +out_drop_write: mnt_drop_write(fhp->fh_export->ex_path.mnt); +out_put: + dput(rdentry); + out_nfserr: err = nfserrno(host_err); out: diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 388e9e8f528..85f7baa15f5 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,11 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - nilfs_mapping_init_once(btnc); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 79037494f1e..1b8ebd888c2 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 6a0e2a189f6..a0babd2bff6 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); - nilfs_mapping_init_once(&shadow->frozen_data); + address_space_init_once(&shadow->frozen_data); nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); - nilfs_mapping_init_once(&shadow->frozen_btnodes); + address_space_init_once(&shadow->frozen_btnodes); nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); mi->mi_shadow = shadow; return 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 98034271cd0..161791d2645 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inc_nlink(old_inode); nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); new_inode->i_ctime = CURRENT_TIME; @@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= NILFS_LINK_MAX) goto out_dir; } - inc_nlink(old_inode); err = nilfs_add_link(new_dentry, old_inode); - if (err) { - drop_nlink(old_inode); - nilfs_mark_inode_dirty(old_inode); + if (err) goto out_dir; - } if (dir_de) { inc_nlink(new_dir); nilfs_mark_inode_dirty(new_dir); @@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = CURRENT_TIME; nilfs_delete_entry(old_de, old_page); - drop_nlink(old_inode); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0c432416cfe..a585b35fd6b 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init_once(struct address_space *mapping) -{ - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - - spin_lock_init(&mapping->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); -} - void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 622df27cd89..2a00953ebd5 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init_once(struct address_space *mapping); void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 55ebae5c7f3..2de9f636792 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, nilfs_segctor_map_segsum_entry( sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); - if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + if (NILFS_I(inode)->i_root && + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); /* skip finfo */ } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 58fd707174e..1673b3d9984 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 6d80ecc7834..7eb90403fc8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, int ret = 0; /* if all else fails, just return false */ struct ocfs2_super *osb; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 5dbc3062b4f..254652a9b54 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); - if (len < 3 || (connectable && len < 6)) { - mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + if (connectable && (len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; type = 255; goto bail; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 43e56b97f9c..6180da1e37e 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) ocfs2_quota_trans_credits(sb); } -/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) dx_root update for free list */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) static inline int ocfs2_add_dir_index_credits(struct super_block *sb) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 849fb4a2e81..d6c25d76b53 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -293,7 +293,7 @@ static int ocfs2_mknod(struct inode *dir, } /* get security xattr */ - status = ocfs2_init_security_get(inode, dir, &si); + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); if (status) { if (status == -EOPNOTSUPP) si.enable = 0; @@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir, } /* get security xattr */ - status = ocfs2_init_security_get(inode, dir, &si); + status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si); if (status) { if (status == -EOPNOTSUPP) si.enable = 0; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 196fcb52d95..d5ab56cbe5c 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; -int ocfs2_quota_setup(void); -void ocfs2_quota_shutdown(void); - #endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 4607923eb24..a73f6416648 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -63,8 +63,6 @@ * write to gf */ -static struct workqueue_struct *ocfs2_quota_wq = NULL; - static void qsync_work_fn(struct work_struct *work); static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) @@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type) OCFS2_QBLK_RESERVED_SPACE; oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); - queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - msecs_to_jiffies(oinfo->dqi_syncms)); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work) struct super_block *sb = oinfo->dqi_gqinode->i_sb; dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); - queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - msecs_to_jiffies(oinfo->dqi_syncms)); + schedule_delayed_work(&oinfo->dqi_sync_work, + msecs_to_jiffies(oinfo->dqi_syncms)); } /* @@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = { .alloc_dquot = ocfs2_alloc_dquot, .destroy_dquot = ocfs2_destroy_dquot, }; - -int ocfs2_quota_setup(void) -{ - ocfs2_quota_wq = create_workqueue("o2quot"); - if (!ocfs2_quota_wq) - return -ENOMEM; - return 0; -} - -void ocfs2_quota_shutdown(void) -{ - if (ocfs2_quota_wq) { - flush_workqueue(ocfs2_quota_wq); - destroy_workqueue(ocfs2_quota_wq); - ocfs2_quota_wq = NULL; - } -} diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b5f9160e93e..c384d634872 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; - u32 new_bit, new_len; + u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; @@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, goto out; } + orig_num_clusters = num_clusters; + while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, @@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + ret = ocfs2_cow_sync_writeback(sb, context, cpos, + orig_num_clusters); if (ret) mlog_errno(ret); } @@ -4325,7 +4328,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode); + error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + &new_dentry->d_name); if (error) mlog_errno(error); } @@ -4376,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path, if (IS_ERR(s)) return PTR_ERR(s); - error = path_lookup(s, LOOKUP_PARENT, nd); + error = kern_path_parent(s, nd); if (error) putname(s); else diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 38f986d2447..236ed1bdca2 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb, struct mount_options *mopt, int is_remount) { - int status; + int status, user_stack = 0; char *p; u32 tmp; @@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb, memcpy(mopt->cluster_stack, args[0].from, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; break; case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; @@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb, } } - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } } status = 1; @@ -1645,16 +1657,11 @@ static int __init ocfs2_init(void) mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); } - status = ocfs2_quota_setup(); - if (status) - goto leave; - ocfs2_set_locking_protocol(); status = register_quota_format(&ocfs2_quota_format); leave: if (status < 0) { - ocfs2_quota_shutdown(); ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); } @@ -1671,8 +1678,6 @@ static void __exit ocfs2_exit(void) { mlog_entry_void(); - ocfs2_quota_shutdown(); - if (ocfs2_wq) { flush_workqueue(ocfs2_wq); destroy_workqueue(ocfs2_wq); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 67cd4391464..6bb602486c6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7185,7 +7185,8 @@ out: * must not hold any lock expect i_mutex. */ int ocfs2_init_security_and_acl(struct inode *dir, - struct inode *inode) + struct inode *inode, + const struct qstr *qstr) { int ret = 0; struct buffer_head *dir_bh = NULL; @@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir, .enable = 1, }; - ret = ocfs2_init_security_get(inode, dir, &si); + ret = ocfs2_init_security_get(inode, dir, qstr, &si); if (!ret) { ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, si.name, si.value, si.value_len, @@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, int ocfs2_init_security_get(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - return security_inode_init_security(inode, dir, &si->name, &si->value, - &si->value_len); + return security_inode_init_security(inode, dir, qstr, &si->name, + &si->value, &si->value_len); } int ocfs2_init_security_set(handle_t *handle, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index aa64bb37a65..d63cfb72316 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode, struct ocfs2_dinode *di); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); int ocfs2_init_security_get(struct inode *, struct inode *, + const struct qstr *, struct ocfs2_security_xattr_info *); int ocfs2_init_security_set(handle_t *, struct inode *, struct buffer_head *, @@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, - struct inode *inode); + struct inode *inode, + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/open.c b/fs/open.c index e52389e1f05..f83ca80cc59 100644 --- a/fs/open.c +++ b/fs/open.c @@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(file->f_mode & FMODE_WRITE)) return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. @@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int (*open)(struct inode *, struct file *), const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); @@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; - f->f_op = fops_get(inode->i_fop); file_sb_list_add(f, inode->i_sb); + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + error = security_dentry_open(f, cred); if (error) goto cleanup_all; @@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; } - ima_counts_get(f); + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -790,6 +812,8 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; + nd->intent.open.file = NULL; + /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) { path_get(&nd->path); @@ -880,15 +904,110 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); char *tmp = getname(filename); int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); + struct file *f = do_filp_open(dfd, tmp, &op, lookup); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); @@ -958,8 +1077,10 @@ int filp_close(struct file *filp, fl_owner_t id) if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 789c625c7aa..b10e3540d5b 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) } vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index 68d6a216ee7..11f688bd76c 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len) int mac_partition(struct parsed_partitions *state) { - int slot = 1; Sector sect; unsigned char *data; - int blk, blocks_in_map; + int slot, blocks_in_map; unsigned secsize; #ifdef CONFIG_PPC_PMAC int found_root = 0; @@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; /* not a MacOS disk */ } - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); blocks_in_map = be32_to_cpu(part->map_count); - for (blk = 1; blk <= blocks_in_map; ++blk) { - int pos = blk * secsize; + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, §); if (!data) @@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state) } if (goodness > found_root_goodness) { - found_root = blk; + found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ - - ++slot; } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index 48cec7cbca1..764b86a0196 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,10 +10,13 @@ #include "check.h" #include "osf.h" +#define MAX_OSF_PARTITIONS 18 + int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; + unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { @@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state) u8 p_fstype; u8 p_frag; __le16 p_cpg; - } d_partitions[8]; + } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; @@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; } - for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) diff --git a/fs/proc/array.c b/fs/proc/array.c index df2b703b9d0..7c99c1cf7e5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cap(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); -#if defined(CONFIG_S390) - task_show_regs(m, task); -#endif task_context_switch_counts(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d096e82b20..d49c4b5d2c3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = { &proc_self_inode_operations, NULL, {}), }; -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode; - struct task_struct *task; - - if (nd->flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - if (task) { - put_task_struct(task); - return 1; - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations proc_base_dentry_operations = -{ - .d_revalidate = proc_base_revalidate, - .d_delete = pid_delete_dentry, -}; - static struct dentry *proc_base_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 176ce4cda68..d6a7ca1fdac 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -27,6 +27,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; + struct ctl_table_header *head; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode) de = PROC_I(inode)->pde; if (de) pde_put(de); - if (PROC_I(inode)->sysctl) - sysctl_head_put(PROC_I(inode)->sysctl); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } } struct vfsmount *proc_mnt; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index d9396a4fc7f..927cbd115e5 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -233,7 +233,7 @@ void __init proc_device_tree_init(void) return; root = of_find_node_by_path("/"); if (root == NULL) { - printk(KERN_ERR "/proc/device-tree: can't find root\n"); + pr_debug("/proc/device-tree: can't find root\n"); return; } proc_device_tree_add_node(root, proc_device_tree); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 09a1f92a34e..f50133c11c2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, ei->sysctl_entry = table; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ inode->i_mode = table->mode; if (!table->child) { inode->i_mode |= S_IFREG; @@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + struct ctl_table_header *head; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ + /* AV: can it, indeed? */ if (!inode) - return 0; + return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(inode)->sysctl); + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0bae036831e..1bba24bad82 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (maxlen < 3) + if (need_parent && (maxlen < 5)) { + *lenp = 5; return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } data[0] = inode->i_ino; data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3eea859e699..c77514bd577 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_mounted_fs_count++; if (reiserfs_mounted_fs_count <= 1) { reiserfs_write_unlock(sb); - commit_wq = create_workqueue("reiserfs"); + commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); reiserfs_write_lock(sb); } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ba5f51ec345..118662690cd 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) new_inode_init(inode, dir, mode); jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &security); + retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, dentry, inode, &security); if (retval) { - dir->i_nlink--; + DEC_DIR_INODE_NLINK(dir) goto out_failed; } @@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir, } new_inode_init(inode, parent_dir, mode); - retval = reiserfs_security_init(parent_dir, inode, &security); + retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, + &security); if (retval < 0) { drop_new_inode(inode); return retval; @@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_unlock(dir->i_sb); return -EMLINK; } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } /* inc before scheduling so reiserfs_unlink knows we are here */ inc_nlink(inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 3cfb2e93364..5c11ca82b78 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) - return -ECHILD; return -EPERM; } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 237c6928d3c..ef66c18a933 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len, * of blocks needed for the transaction. If successful, reiserfs_security * must be released using reiserfs_security_free when the caller is done. */ int reiserfs_security_init(struct inode *dir, struct inode *inode, + const struct qstr *qstr, struct reiserfs_security_handle *sec) { int blocks = 0; @@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_inode_init_security(inode, dir, &sec->name, + error = security_inode_init_security(inode, dir, qstr, &sec->name, &sec->value, &sec->length); if (error) { if (error == -EOPNOTSUPP) diff --git a/fs/stat.c b/fs/stat.c index d5c61cf2b70..961039121cb 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_NO_AUTOMOUNT) lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, 0, &path); + error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); if (!error) { struct inode *inode = path.dentry->d_inode; diff --git a/fs/statfs.c b/fs/statfs.c index 30ea8c8a996..8244924dec5 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf) } EXPORT_SYMBOL(vfs_statfs); -static int do_statfs_native(struct path *path, struct statfs *buf) +int user_statfs(const char __user *pathname, struct kstatfs *st) { - struct kstatfs st; - int retval; + struct path path; + int error = user_path(pathname, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} - retval = vfs_statfs(path, &st); - if (retval) - return retval; +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and f_ffree may be -1; it's okay to stuff * that into 32 bits */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } -static int do_statfs64(struct path *path, struct statfs64 *buf) +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = do_statfs_native(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { - struct path path; - long error; - + struct kstatfs st; + int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = do_statfs64(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { - struct file *file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_native(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) { - struct file *file; - struct statfs64 tmp; + struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs64(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } diff --git a/fs/super.c b/fs/super.c index 74e149efed8..7e9dd4cc2c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s) struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { fs->kill_sb(s); + /* + * We need to call rcu_barrier so all the delayed rcu free + * inodes are flushed before we release the fs module. + */ + rcu_barrier(); put_filesystem(fs); put_super(s); } else { diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b427b1208c2..e474fbcf8bd 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, new_de = sysv_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); sysv_set_link(new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) goto out_dir; } - inode_inc_link_count(old_inode); err = sysv_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } sysv_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { sysv_set_link(dir_de, dir_page, new_dir); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 14f64b689d7..7217d67a80a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - * - * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and - * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not - * lock 'dirA->i_mutex', so this is possible. Both of the functions - * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes - * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this - * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' - * to the list of orphans. After this, 'vfs_link()' will link - * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, - * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode - * to the list of orphans. - */ - if (inode->i_nlink == 0) - return -ENOENT; - err = dbg_check_synced_i_size(inode); if (err) return err; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2be0f9eb86d..f1dce848ef9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -32,6 +32,8 @@ #include <linux/crc-itu-t.h> #include <linux/exportfs.h> +enum { UDF_MAX_LINKS = 0xffff }; + static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) { @@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *iinfo; err = -EMLINK; - if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) + if (dir->i_nlink >= UDF_MAX_LINKS) goto out; err = -EIO; @@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct fileIdentDesc cfi, *fi; int err; - if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { + if (inode->i_nlink >= UDF_MAX_LINKS) return -EMLINK; - } fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { @@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; retval = -EMLINK; - if (!new_inode && - new_dir->i_nlink >= - (256 << sizeof(new_dir->i_nlink)) - 1) + if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS) goto end_rename; } if (!nfi) { @@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *lenp = 5; return 255; + } else if (len < 3) { + *lenp = 3; + return 255; + } *lenp = 3; fid->udf.block = location.logicalBlockNum; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 12f39b9e443..d6f681535eb 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -306,7 +306,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); ufs_set_link(new_dir, new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -318,12 +317,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= UFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = ufs_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -331,12 +327,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME_SEC; ufs_delete_entry(old_dir, old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { ufs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ac1c7e8378d..f83a4c830a6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -2022,11 +2022,12 @@ xfs_buf_init(void) if (!xfslogd_workqueue) goto out_free_buf_zone; - xfsdatad_workqueue = create_workqueue("xfsdatad"); + xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1); if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfsconvertd_workqueue = create_workqueue("xfsconvertd"); + xfsconvertd_workqueue = alloc_workqueue("xfsconvertd", + WQ_MEM_RECLAIM, 1); if (!xfsconvertd_workqueue) goto out_destroy_xfsdatad_workqueue; diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c index 05201ae719e..d61611c8801 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -152,6 +152,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); if (copy_from_user(&range, urange, sizeof(range))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index fc0114da7fd..f4f878fc008 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -89,8 +89,10 @@ xfs_fs_encode_fh( * seven combinations work. The real answer is "don't use v2". */ len = xfs_fileid_length(fileid_type); - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } *max_len = len; switch (fileid_type) { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f5e2a19e0f8..0ca0e3c024d 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1( xfs_mount_t *mp, void __user *arg) { - xfs_fsop_geom_v1_t fsgeo; + xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) return -error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) return -XFS_ERROR(EFAULT); return 0; } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index bd5727852fd..9ff7fc603d2 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -102,7 +102,8 @@ xfs_mark_inode_dirty( STATIC int xfs_init_security( struct inode *inode, - struct inode *dir) + struct inode *dir, + const struct qstr *qstr) { struct xfs_inode *ip = XFS_I(inode); size_t length; @@ -110,7 +111,7 @@ xfs_init_security( unsigned char *name; int error; - error = security_inode_init_security(inode, dir, (char **)&name, + error = security_inode_init_security(inode, dir, qstr, (char **)&name, &value, &length); if (error) { if (error == -EOPNOTSUPP) @@ -194,7 +195,7 @@ xfs_vn_mknod( inode = VFS_I(ip); - error = xfs_init_security(inode, dir); + error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -367,7 +368,7 @@ xfs_vn_symlink( inode = VFS_I(cip); - error = xfs_init_security(inode, dir); + error = xfs_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cec89dd5d7d..85668efb3e3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -53,6 +53,9 @@ xfs_fs_geometry( xfs_fsop_geom_t *geo, int new_version) { + + memset(geo, 0, sizeof(*geo)); + geo->blocksize = mp->m_sb.sb_blocksize; geo->rtextsize = mp->m_sb.sb_rextsize; geo->agblocks = mp->m_sb.sb_agblocks; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index edfa178bafb..4aff5639573 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -309,7 +309,7 @@ xfs_mru_cache_init(void) if (!xfs_mru_elem_zone) goto out; - xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); if (!xfs_mru_reap_wq) goto out_destroy_mru_elem_zone; |