diff options
Diffstat (limited to 'fs')
542 files changed, 18268 insertions, 14875 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 945aa5f02f9..a9ea73d6dcf 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, uint16_t klen = 0; v9ses = (struct v9fs_session_info *)cookie_netfs_data; - P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses, - buffer, bufmax); + p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", + v9ses, buffer, bufmax); if (v9ses->cachetag) klen = strlen(v9ses->cachetag); @@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, return 0; memcpy(buffer, v9ses->cachetag, klen); - P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag); + p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); return klen; } @@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, v9ses); - P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", + v9ses, v9ses->fscache); } void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses, - v9ses->fscache); + p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", + v9ses, v9ses->fscache); fscache_relinquish_cookie(v9ses->fscache, 0); v9ses->fscache = NULL; } @@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->qid.path); + p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", + &v9inode->vfs_inode, v9inode->qid.path); return sizeof(v9inode->qid.path); } @@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, const struct v9fs_inode *v9inode = cookie_netfs_data; *size = i_size_read(&v9inode->vfs_inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode, - *size); + p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", + &v9inode->vfs_inode, *size); } static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, @@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->qid.version); + p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", + &v9inode->vfs_inode, v9inode->qid.version); return sizeof(v9inode->qid.version); } @@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9inode->fscache); } void v9fs_cache_inode_put_cookie(struct inode *inode) @@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 0); v9inode->fscache = NULL; @@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) if (!v9inode->fscache) return; - P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, - v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", + inode, v9inode->fscache); fscache_relinquish_cookie(v9inode->fscache, 1); v9inode->fscache = NULL; @@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, v9inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", - inode, old, v9inode->fscache); + p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", + inode, old, v9inode->fscache); spin_unlock(&v9inode->fscache_lock); } @@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (!v9inode->fscache) return -ENOBUFS; @@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); return 1; case 0: - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode, int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); + p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); if (!v9inode->fscache) return -ENOBUFS; @@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode, switch (ret) { case -ENOBUFS: case -ENODATA: - P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret); + p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); return 1; case 0: BUG_ON(!list_empty(pages)); BUG_ON(*nr_pages != 0); - P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted"); + p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); return ret; default: - P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret); + p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); return ret; } } @@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) int ret; const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); - P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); + p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); } @@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { const struct v9fs_inode *v9inode = V9FS_I(inode); - P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); + p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); if (PageFsCache(page)) fscache_wait_on_page_write(v9inode->fscache, page); } diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 85b67ffa2a4..da8eefbe830 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { struct v9fs_dentry *dent; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", - fid->fid, dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n", + fid->fid, dentry->d_name.name); dent = dentry->d_fsdata; if (!dent) { @@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) struct v9fs_dentry *dent; struct p9_fid *fid, *ret; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, uid, any); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", + dentry->d_name.name, dentry, uid, any); dent = (struct v9fs_dentry *) dentry->d_fsdata; ret = NULL; if (dent) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 2b78014a124..1964f98e74b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -85,15 +87,15 @@ static int get_cache_mode(char *s) if (!strcmp(s, "loose")) { version = CACHE_LOOSE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; - P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); } else - printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + pr_info("Unknown Cache mode %s\n", s); return version; } @@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_debug: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltuid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_dfltgid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_afid: r = match_int(&args[0], &option); if (r < 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); ret = r; continue; } @@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of cache arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); goto free_and_return; } ret = get_cache_mode(s); @@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; - P9_DPRINTK(P9_DEBUG_ERROR, - "problem allocating copy of access arg\n"); + p9_debug(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); goto free_and_return; } @@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->uid = simple_strtoul(s, &e, 10); if (*e != '\0') { ret = -EINVAL; - printk(KERN_INFO "9p: Unknown access " - "argument %s.\n", s); + pr_info("Unknown access argument %s\n", + s); kfree(s); goto free_and_return; } @@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FS_POSIX_ACL v9ses->flags |= V9FS_POSIX_ACL; #else - P9_DPRINTK(P9_DEBUG_ERROR, - "Not defined CONFIG_9P_FS_POSIX_ACL. " - "Ignoring posixacl option\n"); + p9_debug(P9_DEBUG_ERROR, + "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n"); #endif break; @@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n"); + p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); goto error; } @@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(fid)) { retval = PTR_ERR(fid); fid = NULL; - P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n"); + p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); goto error; } @@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } @@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses) { - P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); + p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses); p9_client_begin_disconnect(v9ses->clnt); } @@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void) static int __init init_v9fs(void) { int err; - printk(KERN_INFO "Installing v9fs 9p2000 file system support\n"); + pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ err = register_filesystem(&v9fs_fs_type); if (err < 0) { - printk(KERN_ERR "Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); return err; } err = v9fs_cache_register(); if (err < 0) { - printk(KERN_ERR "Failed to register v9fs for caching\n"); + pr_err("Failed to register v9fs for caching\n"); goto out_fs_unreg; } err = v9fs_sysfs_init(); if (err < 0) { - printk(KERN_ERR "Failed to register with sysfs\n"); + pr_err("Failed to register with sysfs\n"); goto out_sysfs_cleanup; } diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 410ffd6ceb5..dc95a252523 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t); + struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 2524e4cbb8e..0ad61c6a65a 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) struct inode *inode; inode = page->mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); BUG_ON(!PageLocked(page)); @@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, struct inode *inode; inode = mapping->host; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); if (ret == 0) return ret; ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); - P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret); + p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } @@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * Now that we do caching with cache mode enabled, We need * to support direct IO */ - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " - "off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, - (long long) pos, nr_segs); + p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp->f_path.dentry->d_name.name, + (long long)pos, nr_segs); return -EINVAL; } diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index e022890c6f4..d529437ff44 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,8 +53,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); return 1; } @@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry) */ static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); /* Don't cache negative dentries */ if (!dentry->d_inode) @@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry) struct v9fs_dentry *dent; struct p9_fid *temp, *current_fid; - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", + dentry->d_name.name, dentry); dent = dentry->d_fsdata; if (dent) { list_for_each_entry_safe(current_fid, temp, &dent->fidlist, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 598fff1a54e..ff911e77965 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; @@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; p9stat_free(&st); goto unlock_and_exit; @@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, struct p9_dirent curdirent; u64 oldoffset = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; @@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, rdir->tail - rdir->head, &curdirent); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + p9_debug(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; goto unlock_and_exit; } @@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) struct p9_fid *fid; fid = filp->private_data; - P9_DPRINTK(P9_DEBUG_VFS, - "v9fs_dir_release: inode: %p filp: %p fid: %d\n", - inode, filp, fid ? fid->fid : -1); + p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); if (fid) p9_client_clunk(fid); return 0; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 62857a810a7..fc06fd27065 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); + p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) @@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) int res = 0; struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); + p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - schedule_timeout_interruptible(P9_LOCK_TIMEOUT); + if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + break; } /* map 9p status to VFS status */ @@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct inode *inode = filp->f_path.dentry->d_inode; int ret = -ENOLCK; - P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp, - cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", + filp, cmd, fl, filp->f_path.dentry->d_name.name); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count, { int n, total, size; - P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, - (long long unsigned) offset, count); + p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", + fid->fid, (long long unsigned)offset, count); n = 0; total = 0; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, struct p9_fid *fid; size_t size; - P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; @@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, loff_t origin = *offset; unsigned long pg_start, pg_end; - P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, - (int)count, (int)*offset); + p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", + data, (int)count, (int)*offset); clnt = fid->clnt; do { @@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); @@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, return retval; mutex_lock(&inode->i_mutex); - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", - filp, datasync); + p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; @@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = filp->f_path.dentry->d_inode; - P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n", - page, (unsigned long)filp->private_data); + p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", + page, (unsigned long)filp->private_data); v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 879ed885173..014c8dd6296 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -23,6 +23,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> @@ -59,15 +61,13 @@ static const struct inode_operations v9fs_symlink_inode_operations; * */ -static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) +static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) { int res; res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; if (v9fs_proto_dotu(v9ses)) { - if (S_ISLNK(mode)) - res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { if (S_ISSOCK(mode)) res |= P9_DMSOCKET; @@ -85,10 +85,33 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res |= P9_DMSETGID; if ((mode & S_ISVTX) == S_ISVTX) res |= P9_DMSETVTX; - if ((mode & P9_DMLINK)) - res |= P9_DMLINK; } + return res; +} +/** + * p9mode2perm- convert plan9 mode bits to unix permission bits + * @v9ses: v9fs session information + * @stat: p9_wstat from which mode need to be derived + * + */ +static int p9mode2perm(struct v9fs_session_info *v9ses, + struct p9_wstat *stat) +{ + int res; + int mode = stat->mode; + + res = mode & S_IALLUGO; + if (v9fs_proto_dotu(v9ses)) { + if ((mode & P9_DMSETUID) == P9_DMSETUID) + res |= S_ISUID; + + if ((mode & P9_DMSETGID) == P9_DMSETGID) + res |= S_ISGID; + + if ((mode & P9_DMSETVTX) == P9_DMSETVTX) + res |= S_ISVTX; + } return res; } @@ -99,14 +122,14 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) * @rdev: major number, minor number in case of device files. * */ -static int p9mode2unixmode(struct v9fs_session_info *v9ses, - struct p9_wstat *stat, dev_t *rdev) +static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; - int mode = stat->mode; + u32 mode = stat->mode; - res = mode & S_IALLUGO; *rdev = 0; + res = p9mode2perm(v9ses, stat); if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -133,24 +156,13 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, res |= S_IFBLK; break; default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); + p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", + type, stat->extension); }; *rdev = MKDEV(major, minor); } else res |= S_IFREG; - if (v9fs_proto_dotu(v9ses)) { - if ((mode & P9_DMSETUID) == P9_DMSETUID) - res |= S_ISUID; - - if ((mode & P9_DMSETGID) == P9_DMSETGID) - res |= S_ISGID; - - if ((mode & P9_DMSETVTX) == P9_DMSETVTX) - res |= S_ISVTX; - } return res; } @@ -251,7 +263,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) static void v9fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } @@ -261,7 +272,7 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode, dev_t rdev) + struct inode *inode, umode_t mode, dev_t rdev) { int err = 0; @@ -281,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, } else if (v9fs_proto_dotu(v9ses)) { inode->i_op = &v9fs_file_inode_operations; } else { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); + p9_debug(P9_DEBUG_ERROR, + "special files without extended mode\n"); err = -EINVAL; goto error; } @@ -307,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; case S_IFLNK: if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " - "legacy protocol.\n"); + p9_debug(P9_DEBUG_ERROR, + "extended modes used with legacy protocol\n"); err = -EINVAL; goto error; } @@ -335,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, break; default: - P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); + p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n", + mode, mode & S_IFMT); err = -EINVAL; goto error; } @@ -352,17 +363,18 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) { int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); + p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); inode = new_inode(sb); if (!inode) { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + pr_warn("%s (%d): Problem allocating inode\n", + __func__, task_pid_nr(current)); return ERR_PTR(-ENOMEM); } err = v9fs_init_inode(v9ses, inode, mode, rdev); @@ -492,7 +504,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, int new) { dev_t rdev; - int retval, umode; + int retval; + umode_t umode; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; @@ -578,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) struct p9_fid *v9fid, *dfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", - dir, dentry, flags); + p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n", + dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); inode = dentry->d_inode; dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); return retval; } if (v9fs_proto_dotl(v9ses)) @@ -635,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; ofid = NULL; @@ -644,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return ERR_PTR(err); } @@ -652,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err); goto error; } - /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - fid = NULL; - goto error; - } - - /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); - goto error; + if (!(perm & P9_DMLINK)) { + /* now walk from the parent so we can get unopened fid */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + p9_debug(P9_DEBUG_VFS, + "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + /* + * instantiate inode and assign the unopened fid to the dentry + */ + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, + "inode creation failed %d\n", err); + goto error; + } + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + d_instantiate(dentry, inode); } - err = v9fs_fid_add(dentry, fid); - if (err < 0) - goto error; - d_instantiate(dentry, inode); return ofid; error: if (ofid) @@ -703,7 +721,7 @@ error: */ static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; @@ -786,14 +804,14 @@ error: * */ -static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int err; u32 perm; struct p9_fid *fid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode | S_IFDIR); @@ -831,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, char *name; int result = 0; - P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_name.name, dentry, nameidata); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", + dir, dentry->d_name.name, dentry, nameidata); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -938,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = old_dentry->d_inode; new_inode = new_dentry->d_inode; @@ -974,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * 9P .u can only handle file rename in the same directory */ - P9_DPRINTK(P9_DEBUG_ERROR, - "old dir and new dir are different\n"); + p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } @@ -1031,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -1068,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_wstat wstat; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; @@ -1131,7 +1148,7 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { - mode_t mode; + umode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; @@ -1167,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, i_nlink); } } - mode = stat->mode & S_IALLUGO; + mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; i_size_write(inode, stat->length); @@ -1213,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct p9_fid *fid; struct p9_wstat *st; - P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); @@ -1235,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) /* copy extension buffer into buffer */ strncpy(buffer, st->extension, buflen); - P9_DPRINTK(P9_DEBUG_VFS, - "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer); + p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n", + dentry->d_name.name, st->extension, buffer); retval = strnlen(buffer, buflen); done: @@ -1257,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) link = ERR_PTR(-ENOMEM); @@ -1288,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); - P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, - IS_ERR(s) ? "<error>" : s); + p9_debug(P9_DEBUG_VFS, " %s %s\n", + dentry->d_name.name, IS_ERR(s) ? "<error>" : s); if (!IS_ERR(s)) __putname(s); } @@ -1304,19 +1321,17 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) */ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, - int mode, const char *extension) + u32 perm, const char *extension) { - u32 perm; struct p9_fid *fid; struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(dir); if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); + p9_debug(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } - perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, (char *) extension, perm, P9_OREAD); if (IS_ERR(fid)) @@ -1340,10 +1355,10 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, - dentry->d_name.name, symname); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, symname); - return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); + return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } /** @@ -1362,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char *name; struct p9_fid *oldfid; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - old_dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", + dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); oldfid = v9fs_fid_clone(old_dentry); if (IS_ERR(oldfid)) @@ -1398,14 +1412,16 @@ clunk_fid: */ static int -v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; char *name; + u32 perm; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, mode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -1427,7 +1443,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return -EINVAL; } - retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + perm = unixmode2p9mode(v9ses, mode); + retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); __putname(name); return retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0b5745e2194..a1e6c990cd4 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -48,7 +48,7 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** @@ -253,7 +253,7 @@ int v9fs_open_to_dotl_flags(int flags) */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct nameidata *nd) { int err = 0; @@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, } name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " - "mode:0x%x\n", name, flags, omode); + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", + name, flags, omode); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, ofid = p9_client_walk(dfid, 0, NULL, 1); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); return err; } @@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in creat %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + err); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_open_dotl failed in creat %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + err); goto error; } v9fs_invalidate_inode_attr(dir); @@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -395,7 +394,7 @@ err_clunk_old_fid: */ static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, int omode) + struct dentry *dentry, umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); err = 0; v9ses = v9fs_inode2v9ses(dir); @@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mkdir %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct p9_fid *fid; struct p9_stat_dotl *st; - P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { @@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, return 0; } +/* + * Attribute flags. + */ +#define P9_ATTR_MODE (1 << 0) +#define P9_ATTR_UID (1 << 1) +#define P9_ATTR_GID (1 << 2) +#define P9_ATTR_SIZE (1 << 3) +#define P9_ATTR_ATIME (1 << 4) +#define P9_ATTR_MTIME (1 << 5) +#define P9_ATTR_CTIME (1 << 6) +#define P9_ATTR_ATIME_SET (1 << 7) +#define P9_ATTR_MTIME_SET (1 << 8) + +struct dotl_iattr_map { + int iattr_valid; + int p9_iattr_valid; +}; + +static int v9fs_mapped_iattr_valid(int iattr_valid) +{ + int i; + int p9_iattr_valid = 0; + struct dotl_iattr_map dotl_iattr_map[] = { + { ATTR_MODE, P9_ATTR_MODE }, + { ATTR_UID, P9_ATTR_UID }, + { ATTR_GID, P9_ATTR_GID }, + { ATTR_SIZE, P9_ATTR_SIZE }, + { ATTR_ATIME, P9_ATTR_ATIME }, + { ATTR_MTIME, P9_ATTR_MTIME }, + { ATTR_CTIME, P9_ATTR_CTIME }, + { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, + { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, + }; + for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { + if (iattr_valid & dotl_iattr_map[i].iattr_valid) + p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; + } + return p9_iattr_valid; +} + /** * v9fs_vfs_setattr_dotl - set file metadata * @dentry: file whose metadata to set @@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) struct p9_fid *fid; struct p9_iattr_dotl p9attr; - P9_DPRINTK(P9_DEBUG_VFS, "\n"); + p9_debug(P9_DEBUG_VFS, "\n"); retval = inode_change_ok(dentry->d_inode, iattr); if (retval) return retval; - p9attr.valid = iattr->ia_valid; + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); p9attr.mode = iattr->ia_mode; p9attr.uid = iattr->ia_uid; p9attr.gid = iattr->ia_gid; @@ -594,7 +633,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { - mode_t mode; + umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, struct v9fs_session_info *v9ses; name = (char *) dentry->d_name.name; - P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", - dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } @@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } @@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, - dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); v9ses = v9fs_inode2v9ses(dir); dir_dentry = v9fs_dentry_from_dir_inode(dir); @@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } @@ -799,7 +836,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; @@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - P9_DPRINTK(P9_DEBUG_VFS, - " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, - dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); + p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry->d_name.name, omode, + MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) return -EINVAL; @@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } @@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - P9_DPRINTK(P9_DEBUG_VFS, - "Failed to get acl values in mknod %d\n", err); + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", + err); goto error; } name = (char *) dentry->d_name.name; @@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); - P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); fid = NULL; goto error; } @@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); - P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", - err); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); goto error; } err = v9fs_fid_add(dentry, fid); @@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) char *link = __getname(); char *target; - P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); if (!link) { link = ERR_PTR(-ENOMEM); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index c70251d47ed..7b0cd87b07c 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -117,11 +117,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - int mode = S_IRWXUGO | S_ISVTX; + umode_t mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; - P9_DPRINTK(P9_DEBUG_VFS, " \n"); + p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) @@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; v9fs_fid_add(root, fid); - P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: @@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; - P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); + p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); @@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); + p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void @@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; @@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); if (!v9inode->writeback_fid) return 0; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index d288773871b..29653b70a9c 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { retval = PTR_ERR(attr_fid); - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_attrwalk failed %zd\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", + retval); attr_fid = NULL; goto error; } @@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, { struct p9_fid *fid; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", - __func__, name, buffer_size); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, int retval, msize, write_count; struct p9_fid *fid = NULL; - P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", - __func__, name, value_len, flags); + p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", + name, value_len, flags); fid = v9fs_fid_clone(dentry); if (IS_ERR(fid)) { @@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, */ retval = p9_client_xattrcreate(fid, name, value_len, flags); if (retval < 0) { - P9_DPRINTK(P9_DEBUG_VFS, - "p9_client_xattrcreate failed %d\n", retval); + p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", + retval); goto error; } msize = fid->clnt->msize; diff --git a/fs/Kconfig b/fs/Kconfig index 5f4c45d4aa1..d621f02a3f9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -218,6 +218,8 @@ source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS +source "fs/exofs/Kconfig.ore" + menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y @@ -266,14 +268,6 @@ source "fs/9p/Kconfig" endif # NETWORK_FILESYSTEMS -if BLOCK -menu "Partition Types" - -source "fs/partitions/Kconfig" - -endmenu -endif - source "fs/nls/Kconfig" source "fs/dlm/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7973b..e95d1b64082 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y diff --git a/fs/Makefile b/fs/Makefile index d2c3353d547..93804d4d66e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,8 @@ else obj-y += no-block.o endif +obj-$(CONFIG_PROC_FS) += proc_namespace.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o @@ -52,7 +54,6 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c8bf36a1996..8e3b36ace30 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -126,9 +126,9 @@ static void adfs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int adfs_show_options(struct seq_file *seq, struct dentry *root) { - struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb); + struct adfs_sb_info *asb = ADFS_SB(root->d_sb); if (asb->s_uid != 0) seq_printf(seq, ",uid=%u", asb->s_uid); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c2b9c79eb64..45a0ce45d7b 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -136,7 +136,7 @@ extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); -extern mode_t prot_to_mode(u32 prot); +extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); @@ -156,8 +156,8 @@ extern void affs_free_bitmap(struct super_block *sb); extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); -extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *); +extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index de37ec84234..52a6407682e 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -390,10 +390,10 @@ secs_to_datestamp(time_t secs, struct affs_date *ds) ds->ticks = cpu_to_be32(secs * 50); } -mode_t +umode_t prot_to_mode(u32 prot) { - int mode = 0; + umode_t mode = 0; if (!(prot & FIBF_NOWRITE)) mode |= S_IWUSR; @@ -421,7 +421,7 @@ void mode_to_prot(struct inode *inode) { u32 prot = AFFS_I(inode)->i_protect; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (!(mode & S_IXUSR)) prot |= FIBF_NOEXECUTE; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 780a11dc631..47806940aac 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -255,13 +255,13 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode; int error; - pr_debug("AFFS: create(%lu,\"%.*s\",0%o)\n",dir->i_ino,(int)dentry->d_name.len, + pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len, dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -285,12 +285,12 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata } int -affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int error; - pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino, + pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino, (int)dentry->d_name.len,dentry->d_name.name,mode); inode = affs_new_inode(dir); diff --git a/fs/affs/super.c b/fs/affs/super.c index b31507d0f9b..8ba73fed796 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -98,7 +98,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb) static void affs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1b0b1955001..e22dc4b4a50 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,9 +28,9 @@ static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, @@ -764,7 +764,7 @@ static void afs_d_release(struct dentry *dentry) /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct afs_file_status status; struct afs_callback cb; @@ -777,7 +777,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o", + _enter("{%x:%u},{%s},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; @@ -948,7 +948,7 @@ error: /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, int mode, +static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct afs_file_status status; @@ -962,7 +962,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, int mode, dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%o,", + _enter("{%x:%u},{%s},%ho,", dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); ret = -ENAMETOOLONG; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d2b0888126d..a306bb6d88d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -109,7 +109,7 @@ struct afs_call { unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned short offset; /* offset into received data store */ + unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index aa59184151d..8f4ce2658b7 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -242,7 +242,7 @@ struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); + _enter("{%s}", path->dentry->d_name.name); newmnt = afs_mntpt_do_automount(path->dentry); if (IS_ERR(newmnt)) @@ -252,7 +252,7 @@ struct vfsmount *afs_d_automount(struct path *path) mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); - _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + _leave(" = %p", newmnt); return newmnt; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e45a323aebb..8ad8c2a0703 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; + struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); + while ((skb = skb_dequeue(&call->rx_queue))) + afs_free_skb(skb); rxrpc_kernel_end_call(rxcall); call->rxcall = NULL; error_kill_call: diff --git a/fs/afs/super.c b/fs/afs/super.c index 356dcf0929e..983ec59fc80 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -495,7 +495,6 @@ static void afs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct afs_vnode *vnode = AFS_FS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(afs_inode_cachep, vnode); } @@ -228,12 +228,6 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -static inline void get_ioctx(struct kioctx *kioctx) -{ - BUG_ON(atomic_read(&kioctx->users) <= 0); - atomic_inc(&kioctx->users); -} - static inline int try_get_ioctx(struct kioctx *kioctx) { return atomic_inc_not_zero(&kioctx->users); @@ -273,7 +267,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) mm = ctx->mm = current->mm; atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 1); + atomic_set(&ctx->users, 2); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -476,14 +470,23 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total) batch->count = total; } -static void kiocb_batch_free(struct kiocb_batch *batch) +static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) { struct kiocb *req, *n; + if (list_empty(&batch->head)) + return; + + spin_lock_irq(&ctx->ctx_lock); list_for_each_entry_safe(req, n, &batch->head, ki_batch) { list_del(&req->ki_batch); + list_del(&req->ki_list); kmem_cache_free(kiocb_cachep, req); + ctx->reqs_active--; } + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up_all(&ctx->wait); + spin_unlock_irq(&ctx->ctx_lock); } /* @@ -600,11 +603,16 @@ static void aio_fput_routine(struct work_struct *data) fput(req->ki_filp); /* Link the iocb into the context's free list */ + rcu_read_lock(); spin_lock_irq(&ctx->ctx_lock); really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); - put_ioctx(ctx); spin_lock_irq(&fput_lock); } spin_unlock_irq(&fput_lock); @@ -635,7 +643,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * this function will be executed w/out any aio kthread wakeup. */ if (unlikely(!fput_atomic(req->ki_filp))) { - get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); @@ -1329,10 +1336,10 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) + if (!ret) { + put_ioctx(ioctx); return 0; - - get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ + } io_destroy(ioctx); } @@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - kiocb_batch_free(&batch); + kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } diff --git a/fs/attr.c b/fs/attr.c index 7ee7ba48831..95053ad8abc 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -166,7 +166,7 @@ EXPORT_SYMBOL(setattr_copy); int notify_change(struct dentry * dentry, struct iattr * attr) { struct inode *inode = dentry->d_inode; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error; struct timespec now; unsigned int ia_valid = attr->ia_valid; @@ -177,7 +177,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr) } if ((ia_valid & ATTR_MODE)) { - mode_t amode = attr->ia_mode; + umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ if (is_sxid(amode)) inode->i_flags &= ~S_NOSEC; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 326dc08d3e3..eb1cc92cd67 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -110,12 +110,14 @@ struct autofs_sb_info { int sub_version; int min_proto; int max_proto; + int compat_daemon; unsigned long exp_timeout; unsigned int type; int reghost_enabled; int needs_reghost; struct super_block *sb; struct mutex wq_mutex; + struct mutex pipe_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ spinlock_t lookup_lock; @@ -155,7 +157,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -struct inode *autofs4_get_inode(struct super_block *, mode_t); +struct inode *autofs4_get_inode(struct super_block *, umode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 509fe1eb66a..85f1fcdb30e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -194,7 +194,7 @@ static int find_autofs_mount(const char *pathname, return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { - if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); if (!err) /* already found some */ @@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname, static int test_by_dev(struct path *path, void *p) { - return path->mnt->mnt_sb->s_dev == *(dev_t *)p; + return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(struct path *path, void *p) @@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; + sbi->compat_daemon = is_compat_task(); } out: mutex_unlock(&sbi->wq_mutex); @@ -538,11 +539,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; - devid = new_encode_dev(path.mnt->mnt_sb->s_dev); + devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; @@ -556,7 +557,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); if (follow_down_one(&path)) - magic = path.mnt->mnt_sb->s_magic; + magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 450f529a4ea..1feb68ecef9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -124,6 +124,7 @@ start: /* Negative dentry - try next */ if (!simple_positive(q)) { spin_unlock(&p->d_lock); + lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); p = q; goto again; } @@ -186,6 +187,7 @@ again: /* Negative dentry - try next */ if (!simple_positive(ret)) { spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); p = ret; goto again; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 8179f1ab817..06858d95512 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,6 +19,7 @@ #include <linux/parser.h> #include <linux/bitops.h> #include <linux/magic.h> +#include <linux/compat.h> #include "autofs_i.h" #include <linux/module.h> @@ -70,10 +71,10 @@ out_kill_sb: kill_litter_super(sb); } -static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) +static int autofs4_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb); - struct inode *root_inode = mnt->mnt_sb->s_root->d_inode; + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct inode *root_inode = root->d_sb->s_root->d_inode; if (!sbi) return 0; @@ -224,7 +225,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; + sbi->compat_daemon = is_compat_task(); mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); @@ -326,7 +329,7 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) +struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index f55ae23b137..75e5f1c8e02 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -26,7 +26,7 @@ static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); -static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); +static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); #ifdef CONFIG_COMPAT static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); @@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e1fbdeef85d..9c098db4334 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct file *file, const void *addr, int bytes) +static int autofs4_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; mm_segment_t fs; const char *data = (const char *)addr; ssize_t wr = 0; - /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/ - sigpipe = sigismember(¤t->pending.signal, SIGPIPE); /* Save pointer to user space and point back to kernel space */ fs = get_fs(); set_fs(KERNEL_DS); + mutex_lock(&sbi->pipe_mutex); while (bytes && (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { data += wr; bytes -= wr; } + mutex_unlock(&sbi->pipe_mutex); set_fs(fs); @@ -90,7 +91,24 @@ static int autofs4_write(struct file *file, const void *addr, int bytes) return (bytes > 0); } - + +/* + * The autofs_v5 packet was misdesigned. + * + * The packets are identical on x86-32 and x86-64, but have different + * alignment. Which means that 'sizeof()' will give different results. + * Fix it up for the case of running 32-bit user mode on a 64-bit kernel. + */ +static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi) +{ + size_t pktsz = sizeof(struct autofs_v5_packet); +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (sbi->compat_daemon > 0) + pktsz -= 4; +#endif + return pktsz; +} + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -110,6 +128,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; + mutex_lock(&sbi->wq_mutex); + + /* Check if we have become catatonic */ + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -147,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - pktsz = sizeof(*packet); - + pktsz = autofs_v5_packet_size(sbi); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); @@ -163,22 +187,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } default: printk("autofs4_notify_daemon: bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); return; } - /* Check if we have become catatonic */ - mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { - pipe = sbi->pipe; - get_file(pipe); - } + pipe = sbi->pipe; + get_file(pipe); + mutex_unlock(&sbi->wq_mutex); - if (pipe) { - if (autofs4_write(pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); - fput(pipe); - } + if (autofs4_write(sbi, pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); } static int autofs4_getpath(struct autofs_sb_info *sbi, @@ -257,6 +277,9 @@ static int validate_request(struct autofs_wait_queue **wait, struct autofs_wait_queue *wq; struct autofs_info *ino; + if (sbi->catatonic) + return -ENOENT; + /* Wait in progress, continue; */ wq = autofs4_find_wait(sbi, qstr); if (wq) { @@ -289,6 +312,9 @@ static int validate_request(struct autofs_wait_queue **wait, if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + if (sbi->catatonic) + return -ENOENT; + wq = autofs4_find_wait(sbi, qstr); if (wq) { *wait = wq; @@ -389,7 +415,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, ret = validate_request(&wq, sbi, &qstr, dentry, notify); if (ret <= 0) { - if (ret == 0) + if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); return ret; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9205cf25f1c..22e9a78872f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -173,7 +173,7 @@ static const struct file_operations bad_file_ops = }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { return -EIO; } @@ -202,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, } static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return -EIO; } @@ -213,7 +213,7 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) } static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { return -EIO; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8342ca67abc..6e6d536767f 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -286,7 +286,6 @@ befs_alloc_inode(struct super_block *sb) static void befs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 9cc07401947..d12c7966db2 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -84,7 +84,7 @@ const struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); -static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int err; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 697af5bf70b..b0391bc402b 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -251,7 +251,6 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) static void bfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26a..1ff94054d35 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -352,13 +359,6 @@ beyond_if: return retval; } - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); #ifdef __alpha__ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 21ac5ee4b43..07d096c4992 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. * If that is the case, retain the original non-zero @@ -1421,7 +1421,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && + if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; size_t size = regset->n * regset->size; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1e9edbdeda7..a9198dfd5f8 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -560,7 +560,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 2: set_bit(Enabled, &e->flags); break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); kill_node(e); @@ -587,7 +587,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, Node *e; struct inode *inode; struct dentry *root, *dentry; - struct super_block *sb = file->f_path.mnt->mnt_sb; + struct super_block *sb = file->f_path.dentry->d_sb; int err = 0; e = create_entry(buffer, count); @@ -666,7 +666,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, switch (res) { case 1: enabled = 0; break; case 2: enabled = 1; break; - case 3: root = dget(file->f_path.mnt->mnt_sb->s_root); + case 3: root = dget(file->f_path.dentry->d_sb->s_root); mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) @@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > queue_max_segments(q)) - nr_pages = queue_max_segments(q); - - return nr_pages; + return min_t(unsigned, + queue_max_segments(q), + queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); } EXPORT_SYMBOL(bio_get_nr_vecs); diff --git a/fs/block_dev.c b/fs/block_dev.c index b07f1da1de4..5e9f198f771 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/blkpg.h> #include <linux/buffer_head.h> +#include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> @@ -24,7 +25,7 @@ #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> -#include <linux/kmemleak.h> +#include <linux/cleancache.h> #include <asm/uaccess.h> #include "internal.h" @@ -82,13 +83,35 @@ static sector_t max_block(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) +void kill_bdev(struct block_device *bdev) { - if (bdev->bd_inode->i_mapping->nrpages == 0) + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) return; + invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); + truncate_inode_pages(mapping, 0); } +EXPORT_SYMBOL(kill_bdev); + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages == 0) + return; + + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ + cleancache_flush_inode(mapping); +} +EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { @@ -425,7 +448,6 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } @@ -493,12 +515,12 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +static struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; - struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -510,12 +532,7 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); - /* - * This vfsmount structure is only used to obtain the - * blockdev_superblock, so tell kmemleak not to report it. - */ - kmemleak_not_leak(bd_mnt); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* @@ -639,6 +656,11 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } +static inline int sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} + /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -1117,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1137,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1159,8 +1183,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) * The latter is necessary to prevent ghost * partitions on a removed medium. */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(disk, bdev); + } if (ret) goto out_clear; } else { @@ -1190,8 +1218,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ - if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) - rescan_partitions(bdev->bd_disk, bdev); + if (bdev->bd_invalidated) { + if (!ret) + rescan_partitions(bdev->bd_disk, bdev); + else if (ret == -ENOMEDIUM) + invalidate_partitions(bdev->bd_disk, bdev); + } if (ret) goto out_unlock_bdev; } @@ -1210,6 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ecb9fd3be14..d33f01c08b6 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config BTRFS_FS_CHECK_INTEGRITY + bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + depends on BTRFS_FS + help + Adds code that examines all block write requests (including + writes of the super block). The goal is to verify that the + state of the filesystem on disk is always consistent, i.e., + after a power-loss or kernel panic event the filesystem is + in a consistent state. + + If the integrity check tool is included and activated in + the mount options, plenty of kernel memory is used, and + plenty of additional CPU cycles are spent. Enabling this + functionality is not intended for normal use. + + In most cases, unless you are a btrfs developer who needs + to verify the integrity of (super)-block write requests + during the run of a regression test, say N diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index c0ddfd29c5e..0c4fa2befae 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o + reada.o backref.o ulist.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o +btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0b394580d86..0cc20b35c1c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -334,7 +334,7 @@ again: if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); - refrigerator(); + try_to_freeze(); } else { spin_unlock_irq(&worker->lock); if (!kthread_should_stop()) { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22c64fff1bd..0436c12da8c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -19,18 +19,793 @@ #include "ctree.h" #include "disk-io.h" #include "backref.h" +#include "ulist.h" +#include "transaction.h" +#include "delayed-ref.h" -struct __data_ref { +/* + * this structure records all encountered refs on the way up to the root + */ +struct __prelim_ref { struct list_head list; - u64 inum; - u64 root; - u64 extent_data_item_offset; + u64 root_id; + struct btrfs_key key; + int level; + int count; + u64 parent; + u64 wanted_disk_byte; }; -struct __shared_ref { - struct list_head list; +static int __add_prelim_ref(struct list_head *head, u64 root_id, + struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count) +{ + struct __prelim_ref *ref; + + /* in case we're adding delayed refs, we're holding the refs spinlock */ + ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + if (!ref) + return -ENOMEM; + + ref->root_id = root_id; + if (key) + ref->key = *key; + else + memset(&ref->key, 0, sizeof(ref->key)); + + ref->level = level; + ref->count = count; + ref->parent = parent; + ref->wanted_disk_byte = wanted_disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, + struct ulist *parents, + struct extent_buffer *eb, int level, + u64 wanted_objectid, u64 wanted_disk_byte) +{ + int ret; + int slot; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; u64 disk_byte; -}; + +add_parent: + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (ret < 0) + return ret; + + if (level != 0) + return 0; + + /* + * if the current leaf is full with EXTENT_DATA items, we must + * check the next one if that holds a reference as well. + * ref->count cannot be used to skip this check. + * repeat this until we don't find any additional EXTENT_DATA items. + */ + while (1) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret) + return 0; + + eb = path->nodes[0]; + for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.objectid != wanted_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + fi = btrfs_item_ptr(eb, slot, + struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte == wanted_disk_byte) + goto add_parent; + } + } + + return 0; +} + +/* + * resolve an indirect backref in the form (root_id, key, level) + * to a logical address + */ +static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct __prelim_ref *ref, + struct ulist *parents) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_key root_key; + struct btrfs_key key = {0}; + struct extent_buffer *eb; + int ret = 0; + int root_level; + int level = ref->level; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root_key.objectid = ref->root_id; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + + rcu_read_lock(); + root_level = btrfs_header_level(root->node); + rcu_read_unlock(); + + if (root_level + 1 == level) + goto out; + + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + pr_debug("search slot in root %llu (level %d, ref count %d) returned " + "%d for key (%llu %u %llu)\n", + (unsigned long long)ref->root_id, level, ref->count, ret, + (unsigned long long)ref->key.objectid, ref->key.type, + (unsigned long long)ref->key.offset); + if (ret < 0) + goto out; + + eb = path->nodes[level]; + if (!eb) { + WARN_ON(1); + ret = 1; + goto out; + } + + if (level == 0) { + if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + } + + /* the last two parameters will only be used for level == 0 */ + ret = add_all_parents(root, path, parents, eb, level, key.objectid, + ref->wanted_disk_byte); +out: + btrfs_free_path(path); + return ret; +} + +/* + * resolve all indirect backrefs from the list + */ +static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + int err; + int ret = 0; + struct __prelim_ref *ref; + struct __prelim_ref *ref_safe; + struct __prelim_ref *new_ref; + struct ulist *parents; + struct ulist_node *node; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + /* + * _safe allows us to insert directly after the current item without + * iterating over the newly inserted items. + * we're also allowed to re-assign ref during iteration. + */ + list_for_each_entry_safe(ref, ref_safe, head, list) { + if (ref->parent) /* already direct */ + continue; + if (ref->count == 0) + continue; + err = __resolve_indirect_ref(fs_info, ref, parents); + if (err) { + if (ret == 0) + ret = err; + continue; + } + + /* we put the first parent into the ref at hand */ + node = ulist_next(parents, NULL); + ref->parent = node ? node->val : 0; + + /* additional parents require new refs being added here */ + while ((node = ulist_next(parents, node))) { + new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + if (!new_ref) { + ret = -ENOMEM; + break; + } + memcpy(new_ref, ref, sizeof(*ref)); + new_ref->parent = node->val; + list_add(&new_ref->list, &ref->list); + } + ulist_reinit(parents); + } + + ulist_free(parents); + return ret; +} + +/* + * merge two lists of backrefs and adjust counts accordingly + * + * mode = 1: merge identical keys, if key is set + * mode = 2: merge identical parents + */ +static int __merge_refs(struct list_head *head, int mode) +{ + struct list_head *pos1; + + list_for_each(pos1, head) { + struct list_head *n2; + struct list_head *pos2; + struct __prelim_ref *ref1; + + ref1 = list_entry(pos1, struct __prelim_ref, list); + + if (mode == 1 && ref1->key.type == 0) + continue; + for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; + pos2 = n2, n2 = pos2->next) { + struct __prelim_ref *ref2; + + ref2 = list_entry(pos2, struct __prelim_ref, list); + + if (mode == 1) { + if (memcmp(&ref1->key, &ref2->key, + sizeof(ref1->key)) || + ref1->level != ref2->level || + ref1->root_id != ref2->root_id) + continue; + ref1->count += ref2->count; + } else { + if (ref1->parent != ref2->parent) + continue; + ref1->count += ref2->count; + } + list_del(&ref2->list); + kfree(ref2); + } + + } + return 0; +} + +/* + * add all currently queued delayed refs from this head whose seq nr is + * smaller or equal that seq to the list + */ +static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, + struct btrfs_key *info_key, + struct list_head *prefs) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + struct rb_node *n = &head->node.rb_node; + int sgn; + int ret = 0; + + if (extent_op && extent_op->update_key) + btrfs_disk_key_to_cpu(info_key, &extent_op->key); + + while ((n = rb_prev(n))) { + struct btrfs_delayed_ref_node *node; + node = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + if (node->bytenr != head->node.bytenr) + break; + WARN_ON(node->is_head); + + if (node->seq > seq) + continue; + + switch (node->action) { + case BTRFS_ADD_DELAYED_EXTENT: + case BTRFS_UPDATE_DELAYED_HEAD: + WARN_ON(1); + continue; + case BTRFS_ADD_DELAYED_REF: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + default: + BUG_ON(1); + } + switch (node->type) { + case BTRFS_TREE_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, 0, node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_BLOCK_REF_KEY: { + struct btrfs_delayed_tree_ref *ref; + + ref = btrfs_delayed_node_to_tree_ref(node); + ret = __add_prelim_ref(prefs, ref->root, info_key, + ref->level + 1, ref->parent, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, + node->bytenr, + node->ref_mod * sgn); + break; + } + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_delayed_data_ref *ref; + struct btrfs_key key; + + ref = btrfs_delayed_node_to_data_ref(node); + + key.objectid = ref->objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = ref->offset; + ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ref->parent, node->bytenr, + node->ref_mod * sgn); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return 0; +} + +/* + * add all inline backrefs for bytenr to the list + */ +static int __add_inline_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int *info_level, + struct list_head *prefs) +{ + int ret = 0; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + unsigned long end; + struct btrfs_extent_item *ei; + u64 flags; + u64 item_size; + + /* + * enumerate all inline refs + */ + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + + item_size = btrfs_item_size_nr(leaf, slot); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + struct btrfs_disk_key disk_key; + + info = (struct btrfs_tree_block_info *)ptr; + *info_level = btrfs_tree_block_level(leaf, info); + btrfs_tree_block_key(leaf, info, &disk_key); + btrfs_disk_key_to_cpu(info_key, &disk_key); + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + while (ptr < end) { + struct btrfs_extent_inline_ref *iref; + u64 offset; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + + switch (type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + *info_level + 1, offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, offset, info_key, + *info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, + count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + ptr += btrfs_extent_inline_ref_size(type); + } + + return 0; +} + +/* + * add all non-inline backrefs for bytenr to the list + */ +static int __add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + struct btrfs_key *info_key, int info_level, + struct list_head *prefs) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key key; + + while (1) { + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + break; + if (ret) { + ret = 0; + break; + } + + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid != bytenr) + break; + if (key.type < BTRFS_TREE_BLOCK_REF_KEY) + continue; + if (key.type > BTRFS_SHARED_DATA_REF_KEY) + break; + + switch (key.type) { + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, 0, info_key, + info_level + 1, key.offset, + bytenr, 1); + break; + case BTRFS_SHARED_DATA_REF_KEY: { + struct btrfs_shared_data_ref *sdref; + int count; + + sdref = btrfs_item_ptr(leaf, slot, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sdref); + ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, + bytenr, count); + break; + } + case BTRFS_TREE_BLOCK_REF_KEY: + ret = __add_prelim_ref(prefs, key.offset, info_key, + info_level + 1, 0, bytenr, 1); + break; + case BTRFS_EXTENT_DATA_REF_KEY: { + struct btrfs_extent_data_ref *dref; + int count; + u64 root; + + dref = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_data_ref); + count = btrfs_extent_data_ref_count(leaf, dref); + key.objectid = btrfs_extent_data_ref_objectid(leaf, + dref); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = btrfs_extent_data_ref_offset(leaf, dref); + root = btrfs_extent_data_ref_root(leaf, dref); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); + break; + } + default: + WARN_ON(1); + } + BUG_ON(ret); + } + + return ret; +} + +/* + * this adds all existing backrefs (inline backrefs, backrefs and delayed + * refs) for the given bytenr to the refs list, merges duplicates and resolves + * indirect refs to their parent bytenr. + * When roots are found, they're added to the roots list + * + * FIXME some caching might speed things up + */ +static int find_parent_nodes(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 seq, struct ulist *refs, struct ulist *roots) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_key info_key = { 0 }; + struct btrfs_delayed_ref_root *delayed_refs = NULL; + struct btrfs_delayed_ref_head *head; + int info_level = 0; + int ret; + struct list_head prefs_delayed; + struct list_head prefs; + struct __prelim_ref *ref; + + INIT_LIST_HEAD(&prefs); + INIT_LIST_HEAD(&prefs_delayed); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * grab both a lock on the path and a lock on the delayed ref head. + * We need both to get a consistent picture of how the refs look + * at a specified point in time + */ +again: + head = NULL; + + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + /* + * look if there are updates for this ref queued and lock the head + */ + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(path); + + /* + * Mutex was contended, block until it's + * released and try again + */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); + if (ret) { + spin_unlock(&delayed_refs->lock); + goto out; + } + } + spin_unlock(&delayed_refs->lock); + + if (path->slots[0]) { + struct extent_buffer *leaf; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = __add_inline_refs(fs_info, path, bytenr, + &info_key, &info_level, &prefs); + if (ret) + goto out; + ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + info_level, &prefs); + if (ret) + goto out; + } + } + btrfs_release_path(path); + + /* + * when adding the delayed refs above, the info_key might not have + * been known yet. Go over the list and replace the missing keys + */ + list_for_each_entry(ref, &prefs_delayed, list) { + if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) + memcpy(&ref->key, &info_key, sizeof(ref->key)); + } + list_splice_init(&prefs_delayed, &prefs); + + ret = __merge_refs(&prefs, 1); + if (ret) + goto out; + + ret = __resolve_indirect_refs(fs_info, &prefs); + if (ret) + goto out; + + ret = __merge_refs(&prefs, 2); + if (ret) + goto out; + + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + if (ref->count < 0) + WARN_ON(1); + if (ref->count && ref->root_id && ref->parent == 0) { + /* no parent == root of tree */ + ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + if (ref->count && ref->parent) { + ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + BUG_ON(ret < 0); + } + kfree(ref); + } + +out: + if (head) + mutex_unlock(&head->mutex); + btrfs_free_path(path); + while (!list_empty(&prefs)) { + ref = list_first_entry(&prefs, struct __prelim_ref, list); + list_del(&ref->list); + kfree(ref); + } + while (!list_empty(&prefs_delayed)) { + ref = list_first_entry(&prefs_delayed, struct __prelim_ref, + list); + list_del(&ref->list); + kfree(ref); + } + + return ret; +} + +/* + * Finds all leafs with a reference to the specified combination of bytenr and + * offset. key_list_head will point to a list of corresponding keys (caller must + * free each list element). The leafs will be stored in the leafs ulist, which + * must be freed with ulist_free. + * + * returns 0 on success, <0 on error + */ +static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **leafs) +{ + struct ulist *tmp; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *leafs = ulist_alloc(GFP_NOFS); + if (!*leafs) { + ulist_free(tmp); + return -ENOMEM; + } + + ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ulist_free(tmp); + + if (ret < 0 && ret != -ENOENT) { + ulist_free(*leafs); + return ret; + } + + return 0; +} + +/* + * walk all backrefs for a given extent to find all roots that reference this + * extent. Walking a backref means finding all extents that reference this + * extent and in turn walk the backrefs of those, too. Naturally this is a + * recursive process, but here it is implemented in an iterative fashion: We + * find all referencing extents for the extent in question and put them on a + * list. In turn, we find all referencing extents for those, further appending + * to the list. The way we iterate the list allows adding more elements after + * the current while iterating. The process stops when we reach the end of the + * list. Found roots are added to the roots list. + * + * returns 0 on success, < 0 on error. + */ +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots) +{ + struct ulist *tmp; + struct ulist_node *node = NULL; + int ret; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + *roots = ulist_alloc(GFP_NOFS); + if (!*roots) { + ulist_free(tmp); + return -ENOMEM; + } + + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, seq, + tmp, *roots); + if (ret < 0 && ret != -ENOENT) { + ulist_free(tmp); + ulist_free(*roots); + return ret; + } + node = ulist_next(tmp, node); + if (!node) + break; + bytenr = node->val; + } + + ulist_free(tmp); + return 0; +} + static int __inode_info(u64 inum, u64 ioff, u8 key_type, struct btrfs_root *fs_root, struct btrfs_path *path, @@ -121,6 +896,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, if (eb != eb_in) free_extent_buffer(eb); ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; if (ret) break; next_inum = found_key.offset; @@ -181,8 +958,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type != BTRFS_EXTENT_ITEM_KEY || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) + found_key->objectid + found_key->offset <= logical) { + pr_debug("logical %llu is not within any extent\n", + (unsigned long long)logical); return -ENOENT; + } eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); @@ -191,6 +971,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); + pr_debug("logical %llu is at position %llu within the extent (%llu " + "EXTENT_ITEM %llu) flags %#llx size %u\n", + (unsigned long long)logical, + (unsigned long long)(logical - found_key->objectid), + (unsigned long long)found_key->objectid, + (unsigned long long)found_key->offset, + (unsigned long long)flags, item_size); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return BTRFS_EXTENT_FLAG_TREE_BLOCK; if (flags & BTRFS_EXTENT_FLAG_DATA) @@ -287,128 +1074,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int __data_list_add(struct list_head *head, u64 inum, - u64 extent_data_item_offset, u64 root) -{ - struct __data_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->inum = inum; - ref->extent_data_item_offset = extent_data_item_offset; - ref->root = root; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, - struct btrfs_extent_data_ref *dref) -{ - return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), - btrfs_extent_data_ref_offset(eb, dref), - btrfs_extent_data_ref_root(eb, dref)); -} - -static int __shared_list_add(struct list_head *head, u64 disk_byte) -{ - struct __shared_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - ref->disk_byte = disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, - u64 logical, u64 inum, - u64 extent_data_item_offset, - u64 extent_offset, - struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) -{ - u64 ref_root; - u32 item_size; - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *eiref; - struct __data_ref *ref; - int ret; - int type; - int last; - unsigned long ptr = 0; - - WARN_ON(!list_empty(data_refs)); - ret = extent_from_logical(fs_info, logical, path, &key); - if (ret & BTRFS_EXTENT_FLAG_DATA) - ret = -EIO; - if (ret < 0) - goto out; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - ret = 0; - ref_root = 0; - /* - * as done in iterate_extent_inodes, we first build a list of refs to - * iterate, then free the path and then iterate them to avoid deadlocks. - */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last < 0) { - ret = last; - goto out; - } - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) { - ref_root = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __data_list_add(data_refs, inum, - extent_data_item_offset, - ref_root); - } - } while (!ret && !last); - - btrfs_release_path(path); - - if (ref_root == 0) { - printk(KERN_ERR "btrfs: failed to find tree block ref " - "for shared data backref %llu\n", logical); - WARN_ON(1); - ret = -EIO; - } - -out: - while (!list_empty(data_refs)) { - ref = list_first_entry(data_refs, struct __data_ref, list); - list_del(&ref->list); - if (!ret) - ret = iterate(ref->inum, extent_offset + - ref->extent_data_item_offset, - ref->root, ctx); - kfree(ref); - } - - return ret; -} - -static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, - u64 logical, u64 orig_extent_item_objectid, - u64 extent_offset, struct btrfs_path *path, - struct list_head *data_refs, - iterate_extent_inodes_t *iterate, - void *ctx) +static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 logical, + u64 orig_extent_item_objectid, + u64 extent_item_pos, u64 root, + iterate_extent_inodes_t *iterate, void *ctx) { u64 disk_byte; struct btrfs_key key; @@ -416,8 +1086,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int slot; int nritems; - int ret; - int found = 0; + int ret = 0; + int extent_type; + u64 data_offset; + u64 data_len; eb = read_tree_block(fs_info->tree_root, logical, fs_info->tree_root->leafsize, 0); @@ -435,149 +1107,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - if (!fi) { - free_extent_buffer(eb); - return -EIO; - } + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) { - if (found) - break; - else - continue; - } - ++found; - ret = __iter_shared_inline_ref_inodes(fs_info, logical, - key.objectid, - key.offset, - extent_offset, path, - data_refs, - iterate, ctx); - if (ret) - break; - } + if (disk_byte != orig_extent_item_objectid) + continue; - if (!found) { - printk(KERN_ERR "btrfs: failed to follow shared data backref " - "to parent %llu\n", logical); - WARN_ON(1); - ret = -EIO; + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + continue; + + pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " + "root %llu\n", orig_extent_item_objectid, + key.objectid, key.offset, root); + ret = iterate(key.objectid, + key.offset + (extent_item_pos - data_offset), + root, ctx); + if (ret) { + pr_debug("stopping iteration because ret=%d\n", ret); + break; + } } free_extent_buffer(eb); + return ret; } /* * calls iterate() for every inode that references the extent identified by - * the given parameters. will use the path given as a parameter and return it - * released. + * the given parameters. * when the iterator function returns a non-zero value, iteration stops. + * path is guaranteed to be in released state when iterate() is called. */ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u64 extent_item_objectid, - u64 extent_offset, + u64 extent_item_objectid, u64 extent_item_pos, iterate_extent_inodes_t *iterate, void *ctx) { - unsigned long ptr = 0; - int last; int ret; - int type; - u64 logical; - u32 item_size; - struct btrfs_extent_inline_ref *eiref; - struct btrfs_extent_data_ref *dref; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; struct list_head data_refs = LIST_HEAD_INIT(data_refs); struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct __data_ref *ref_d; - struct __shared_ref *ref_s; + struct btrfs_trans_handle *trans; + struct ulist *refs; + struct ulist *roots; + struct ulist_node *ref_node = NULL; + struct ulist_node *root_node = NULL; + struct seq_list seq_elem; + struct btrfs_delayed_ref_root *delayed_refs; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + pr_debug("resolving all inodes for extent %llu\n", + extent_item_objectid); + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + btrfs_get_delayed_seq(delayed_refs, &seq_elem); + spin_unlock(&delayed_refs->lock); + + ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, + extent_item_pos, seq_elem.seq, + &refs); - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - /* first we iterate the inline refs, ... */ - do { - last = __get_extent_inline_ref(&ptr, eb, ei, item_size, - &eiref, &type); - if (last == -ENOENT) { - ret = 0; - break; - } - if (last < 0) { - ret = last; - break; - } - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&eiref->offset); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - logical = btrfs_extent_inline_ref_offset(eb, eiref); - ret = __shared_list_add(&shared_refs, logical); - } - } while (!ret && !last); + if (ret) + goto out; - /* ... then we proceed to in-tree references and ... */ - while (!ret) { - ++path->slots[0]; - if (path->slots[0] > btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret) { - if (ret == 1) - ret = 0; /* we're done */ - break; - } - eb = path->nodes[0]; - } - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_item_objectid) + while (!ret && (ref_node = ulist_next(refs, ref_node))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, + seq_elem.seq, &roots); + if (ret) break; - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = __data_list_add_eb(&data_refs, eb, dref); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ret = __shared_list_add(&shared_refs, key.offset); + while (!ret && (root_node = ulist_next(roots, root_node))) { + pr_debug("root %llu references leaf %llu\n", + root_node->val, ref_node->val); + ret = iterate_leaf_refs(fs_info, path, ref_node->val, + extent_item_objectid, + extent_item_pos, root_node->val, + iterate, ctx); } } - btrfs_release_path(path); - - /* - * ... only at the very end we can process the refs we found. this is - * because the iterator function we call is allowed to make tree lookups - * and we have to avoid deadlocks. additionally, we need more tree - * lookups ourselves for shared data refs. - */ - while (!list_empty(&data_refs)) { - ref_d = list_first_entry(&data_refs, struct __data_ref, list); - list_del(&ref_d->list); - if (!ret) - ret = iterate(ref_d->inum, extent_offset + - ref_d->extent_data_item_offset, - ref_d->root, ctx); - kfree(ref_d); - } - - while (!list_empty(&shared_refs)) { - ref_s = list_first_entry(&shared_refs, struct __shared_ref, - list); - list_del(&ref_s->list); - if (!ret) - ret = __iter_shared_inline_ref(fs_info, - ref_s->disk_byte, - extent_item_objectid, - extent_offset, path, - &data_refs, - iterate, ctx); - kfree(ref_s); - } - + ulist_free(refs); + ulist_free(roots); +out: + btrfs_put_delayed_seq(delayed_refs, &seq_elem); + btrfs_end_transaction(trans, fs_info->extent_root); return ret; } @@ -586,19 +1208,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - u64 offset; + u64 extent_item_pos; struct btrfs_key found_key; ret = extent_from_logical(fs_info, logical, path, &found_key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -EINVAL; if (ret < 0) return ret; - offset = logical - found_key.objectid; + extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, path, found_key.objectid, - offset, iterate, ctx); + extent_item_pos, iterate, ctx); return ret; } @@ -643,6 +1266,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ + pr_debug("following ref at offset %u for inode %llu in " + "tree %llu\n", cur, + (unsigned long long)found_key.objectid, + (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); if (ret) { free_extent_buffer(eb); @@ -683,10 +1310,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { + pr_debug("path resolved: %s\n", fspath); ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { + pr_debug("missed path, not enough space. missing bytes: %lu, " + "constructed so far: %s\n", + (unsigned long)(fspath_min - fspath), fspath_min); ++ipath->fspath->elem_missed; ipath->fspath->bytes_missing += fspath_min - fspath; ipath->fspath->bytes_left = 0; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 92618837cb8..d00dfa9ca93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -20,6 +20,7 @@ #define __BTRFS_BACKREF__ #include "ioctl.h" +#include "ulist.h" struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 seq, struct ulist **roots); + struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 634608d2a6d..9b9b15fd520 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,9 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* held while doing delalloc reservations */ + struct mutex delalloc_mutex; + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c new file mode 100644 index 00000000000..d986824bb2b --- /dev/null +++ b/fs/btrfs/check-integrity.c @@ -0,0 +1,3069 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* + * This module can be used to catch cases when the btrfs kernel + * code executes write requests to the disk that bring the file + * system in an inconsistent state. In such a state, a power-loss + * or kernel panic event would cause that the data on disk is + * lost or at least damaged. + * + * Code is added that examines all block write requests during + * runtime (including writes of the super block). Three rules + * are verified and an error is printed on violation of the + * rules: + * 1. It is not allowed to write a disk block which is + * currently referenced by the super block (either directly + * or indirectly). + * 2. When a super block is written, it is verified that all + * referenced (directly or indirectly) blocks fulfill the + * following requirements: + * 2a. All referenced blocks have either been present when + * the file system was mounted, (i.e., they have been + * referenced by the super block) or they have been + * written since then and the write completion callback + * was called and a FLUSH request to the device where + * these blocks are located was received and completed. + * 2b. All referenced blocks need to have a generation + * number which is equal to the parent's number. + * + * One issue that was found using this module was that the log + * tree on disk became temporarily corrupted because disk blocks + * that had been in use for the log tree had been freed and + * reused too early, while being referenced by the written super + * block. + * + * The search term in the kernel log that can be used to filter + * on the existence of detected integrity issues is + * "btrfs: attempt". + * + * The integrity check is enabled via mount options. These + * mount options are only supported if the integrity check + * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. + * + * Example #1, apply integrity checks to all metadata: + * mount /dev/sdb1 /mnt -o check_int + * + * Example #2, apply integrity checks to all metadata and + * to data extents: + * mount /dev/sdb1 /mnt -o check_int_data + * + * Example #3, apply integrity checks to all metadata and dump + * the tree that the super block references to kernel messages + * each time after a super block was written: + * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 + * + * If the integrity check tool is included and activated in + * the mount options, plenty of kernel memory is used, and + * plenty of additional CPU cycles are spent. Enabling this + * functionality is not intended for normal use. In most + * cases, unless you are a btrfs developer who needs to verify + * the integrity of (super)-block write requests, do not + * enable the config option BTRFS_FS_CHECK_INTEGRITY to + * include and compile the integrity check tool. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/crc32c.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "extent_io.h" +#include "disk-io.h" +#include "volumes.h" +#include "print-tree.h" +#include "locking.h" +#include "check-integrity.h" + +#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 +#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 +#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 +#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 +#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 +#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 +#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, + * excluding " [...]" */ +#define BTRFSIC_BLOCK_SIZE PAGE_SIZE + +#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) + +/* + * The definition of the bitmask fields for the print_mask. + * They are specified with the mount option check_integrity_print_mask. + */ +#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 +#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 +#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 +#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 +#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 +#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 +#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 +#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 +#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 +#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 +#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 +#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 + +struct btrfsic_dev_state; +struct btrfsic_state; + +struct btrfsic_block { + u32 magic_num; /* only used for debug purposes */ + unsigned int is_metadata:1; /* if it is meta-data, not data-data */ + unsigned int is_superblock:1; /* if it is one of the superblocks */ + unsigned int is_iodone:1; /* if is done by lower subsystem */ + unsigned int iodone_w_error:1; /* error was indicated to endio */ + unsigned int never_written:1; /* block was added because it was + * referenced, not because it was + * written */ + unsigned int mirror_num:2; /* large enough to hold + * BTRFS_SUPER_MIRROR_MAX */ + struct btrfsic_dev_state *dev_state; + u64 dev_bytenr; /* key, physical byte num on disk */ + u64 logical_bytenr; /* logical byte num on disk */ + u64 generation; + struct btrfs_disk_key disk_key; /* extra info to print in case of + * issues, will not always be correct */ + struct list_head collision_resolving_node; /* list node */ + struct list_head all_blocks_node; /* list node */ + + /* the following two lists contain block_link items */ + struct list_head ref_to_list; /* list */ + struct list_head ref_from_list; /* list */ + struct btrfsic_block *next_in_same_bio; + void *orig_bio_bh_private; + union { + bio_end_io_t *bio; + bh_end_io_t *bh; + } orig_bio_bh_end_io; + int submit_bio_bh_rw; + u64 flush_gen; /* only valid if !never_written */ +}; + +/* + * Elements of this type are allocated dynamically and required because + * each block object can refer to and can be ref from multiple blocks. + * The key to lookup them in the hashtable is the dev_bytenr of + * the block ref to plus the one from the block refered from. + * The fact that they are searchable via a hashtable and that a + * ref_cnt is maintained is not required for the btrfs integrity + * check algorithm itself, it is only used to make the output more + * beautiful in case that an error is detected (an error is defined + * as a write operation to a block while that block is still referenced). + */ +struct btrfsic_block_link { + u32 magic_num; /* only used for debug purposes */ + u32 ref_cnt; + struct list_head node_ref_to; /* list node */ + struct list_head node_ref_from; /* list node */ + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block *block_ref_to; + struct btrfsic_block *block_ref_from; + u64 parent_generation; +}; + +struct btrfsic_dev_state { + u32 magic_num; /* only used for debug purposes */ + struct block_device *bdev; + struct btrfsic_state *state; + struct list_head collision_resolving_node; /* list node */ + struct btrfsic_block dummy_block_for_bio_bh_flush; + u64 last_flush_gen; + char name[BDEVNAME_SIZE]; +}; + +struct btrfsic_block_hashtable { + struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_link_hashtable { + struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; +}; + +struct btrfsic_dev_state_hashtable { + struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; +}; + +struct btrfsic_block_data_ctx { + u64 start; /* virtual bytenr */ + u64 dev_bytenr; /* physical bytenr on device */ + u32 len; + struct btrfsic_dev_state *dev; + char *data; + struct buffer_head *bh; /* do not use if set to NULL */ +}; + +/* This structure is used to implement recursion without occupying + * any stack space, refer to btrfsic_process_metablock() */ +struct btrfsic_stack_frame { + u32 magic; + u32 nr; + int error; + int i; + int limit_nesting; + int num_copies; + int mirror_num; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx *block_ctx; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfs_header *hdr; + struct btrfsic_stack_frame *prev; +}; + +/* Some state per mounted filesystem */ +struct btrfsic_state { + u32 print_mask; + int include_extent_data; + int csum_size; + struct list_head all_blocks_list; + struct btrfsic_block_hashtable block_hashtable; + struct btrfsic_block_link_hashtable block_link_hashtable; + struct btrfs_root *root; + u64 max_superblock_generation; + struct btrfsic_block *latest_superblock; +}; + +static void btrfsic_block_init(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_alloc(void); +static void btrfsic_block_free(struct btrfsic_block *b); +static void btrfsic_block_link_init(struct btrfsic_block_link *n); +static struct btrfsic_block_link *btrfsic_block_link_alloc(void); +static void btrfsic_block_link_free(struct btrfsic_block_link *n); +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h); +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h); +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h); +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h); +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices); +static int btrfsic_process_metablock(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + struct btrfs_header *hdr, + int limit_nesting, int force_iodone_flag); +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx + *block_ctx, u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation); +static int btrfsic_handle_extent_data(struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag); +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num); +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out); +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx); +static void btrfsic_dump_database(struct btrfsic_state *state); +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size); +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, u8 *mapped_data, + unsigned int len, struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw); +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const block, + struct btrfs_super_block *const super_hdr); +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); +static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level); +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level); +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l); +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block); +static void btrfsic_dump_tree(const struct btrfsic_state *state); +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level); +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation); +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created); +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev); +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data); + +static struct mutex btrfsic_mutex; +static int btrfsic_is_initialized; +static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; + + +static void btrfsic_block_init(struct btrfsic_block *b) +{ + b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; + b->dev_state = NULL; + b->dev_bytenr = 0; + b->logical_bytenr = 0; + b->generation = BTRFSIC_GENERATION_UNKNOWN; + b->disk_key.objectid = 0; + b->disk_key.type = 0; + b->disk_key.offset = 0; + b->is_metadata = 0; + b->is_superblock = 0; + b->is_iodone = 0; + b->iodone_w_error = 0; + b->never_written = 0; + b->mirror_num = 0; + b->next_in_same_bio = NULL; + b->orig_bio_bh_private = NULL; + b->orig_bio_bh_end_io.bio = NULL; + INIT_LIST_HEAD(&b->collision_resolving_node); + INIT_LIST_HEAD(&b->all_blocks_node); + INIT_LIST_HEAD(&b->ref_to_list); + INIT_LIST_HEAD(&b->ref_from_list); + b->submit_bio_bh_rw = 0; + b->flush_gen = 0; +} + +static struct btrfsic_block *btrfsic_block_alloc(void) +{ + struct btrfsic_block *b; + + b = kzalloc(sizeof(*b), GFP_NOFS); + if (NULL != b) + btrfsic_block_init(b); + + return b; +} + +static void btrfsic_block_free(struct btrfsic_block *b) +{ + BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); + kfree(b); +} + +static void btrfsic_block_link_init(struct btrfsic_block_link *l) +{ + l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; + l->ref_cnt = 1; + INIT_LIST_HEAD(&l->node_ref_to); + INIT_LIST_HEAD(&l->node_ref_from); + INIT_LIST_HEAD(&l->collision_resolving_node); + l->block_ref_to = NULL; + l->block_ref_from = NULL; +} + +static struct btrfsic_block_link *btrfsic_block_link_alloc(void) +{ + struct btrfsic_block_link *l; + + l = kzalloc(sizeof(*l), GFP_NOFS); + if (NULL != l) + btrfsic_block_link_init(l); + + return l; +} + +static void btrfsic_block_link_free(struct btrfsic_block_link *l) +{ + BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); + kfree(l); +} + +static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) +{ + ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; + ds->bdev = NULL; + ds->state = NULL; + ds->name[0] = '\0'; + INIT_LIST_HEAD(&ds->collision_resolving_node); + ds->last_flush_gen = 0; + btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); + ds->dummy_block_for_bio_bh_flush.is_iodone = 1; + ds->dummy_block_for_bio_bh_flush.dev_state = ds; +} + +static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) +{ + struct btrfsic_dev_state *ds; + + ds = kzalloc(sizeof(*ds), GFP_NOFS); + if (NULL != ds) + btrfsic_dev_state_init(ds); + + return ds; +} + +static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) +{ + BUG_ON(!(NULL == ds || + BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); + kfree(ds); +} + +static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_hashtable_add(struct btrfsic_block *b, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(b->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)b->dev_state->bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + + list_add(&b->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) +{ + list_del(&b->collision_resolving_node); +} + +static struct btrfsic_block *btrfsic_block_hashtable_lookup( + struct block_device *bdev, + u64 dev_bytenr, + struct btrfsic_block_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)bdev))) & + (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block *const b = + list_entry(elem, struct btrfsic_block, + collision_resolving_node); + + if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) + return b; + } + + return NULL; +} + +static void btrfsic_block_link_hashtable_init( + struct btrfsic_block_link_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_block_link_hashtable_add( + struct btrfsic_block_link *l, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ + ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ + ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ + ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) + & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + list_add(&l->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) +{ + list_del(&l->collision_resolving_node); +} + +static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( + struct block_device *bdev_ref_to, + u64 dev_bytenr_ref_to, + struct block_device *bdev_ref_from, + u64 dev_bytenr_ref_from, + struct btrfsic_block_link_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ + ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ + ((unsigned int)((uintptr_t)bdev_ref_to)) ^ + ((unsigned int)((uintptr_t)bdev_ref_from))) & + (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_block_link *const l = + list_entry(elem, struct btrfsic_block_link, + collision_resolving_node); + + BUG_ON(NULL == l->block_ref_to); + BUG_ON(NULL == l->block_ref_from); + if (l->block_ref_to->dev_state->bdev == bdev_ref_to && + l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && + l->block_ref_from->dev_state->bdev == bdev_ref_from && + l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) + return l; + } + + return NULL; +} + +static void btrfsic_dev_state_hashtable_init( + struct btrfsic_dev_state_hashtable *h) +{ + int i; + + for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) + INIT_LIST_HEAD(h->table + i); +} + +static void btrfsic_dev_state_hashtable_add( + struct btrfsic_dev_state *ds, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)ds->bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + + list_add(&ds->collision_resolving_node, h->table + hashval); +} + +static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) +{ + list_del(&ds->collision_resolving_node); +} + +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( + struct block_device *bdev, + struct btrfsic_dev_state_hashtable *h) +{ + const unsigned int hashval = + (((unsigned int)((uintptr_t)bdev)) & + (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + struct list_head *elem; + + list_for_each(elem, h->table + hashval) { + struct btrfsic_dev_state *const ds = + list_entry(elem, struct btrfsic_dev_state, + collision_resolving_node); + + if (ds->bdev == bdev) + return ds; + } + + return NULL; +} + +static int btrfsic_process_superblock(struct btrfsic_state *state, + struct btrfs_fs_devices *fs_devices) +{ + int ret = 0; + struct btrfs_super_block *selected_super; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + struct btrfsic_dev_state *selected_dev_state = NULL; + int pass; + + BUG_ON(NULL == state); + selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + if (NULL == selected_super) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return -1; + } + + list_for_each_entry(device, dev_head, dev_list) { + int i; + struct btrfsic_dev_state *dev_state; + + if (!device->bdev || !device->name) + continue; + + dev_state = btrfsic_dev_state_lookup(device->bdev); + BUG_ON(NULL == dev_state); + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + ret = btrfsic_process_superblock_dev_mirror( + state, dev_state, device, i, + &selected_dev_state, selected_super); + if (0 != ret && 0 == i) { + kfree(selected_super); + return ret; + } + } + } + + if (NULL == state->latest_superblock) { + printk(KERN_INFO "btrfsic: no superblock found!\n"); + kfree(selected_super); + return -1; + } + + state->csum_size = btrfs_super_csum_size(selected_super); + + for (pass = 0; pass < 3; pass++) { + int num_copies; + int mirror_num; + u64 next_bytenr; + + switch (pass) { + case 0: + next_bytenr = btrfs_super_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + next_bytenr = btrfs_super_chunk_root(selected_super); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + next_bytenr = btrfs_super_log_root(selected_super); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + struct btrfs_header *hdr; + + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(root @%llu," + " mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + kfree(selected_super); + return -1; + } + + next_block = btrfsic_block_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + &state->block_hashtable); + BUG_ON(NULL == next_block); + + l = btrfsic_block_link_hashtable_lookup( + tmp_next_block_ctx.dev->bdev, + tmp_next_block_ctx.dev_bytenr, + state->latest_superblock->dev_state-> + bdev, + state->latest_superblock->dev_bytenr, + &state->block_link_hashtable); + BUG_ON(NULL == l); + + ret = btrfsic_read_block(state, &tmp_next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read @logical %llu failed!\n", + (unsigned long long) + tmp_next_block_ctx.start); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + kfree(selected_super); + return -1; + } + + hdr = (struct btrfs_header *)tmp_next_block_ctx.data; + ret = btrfsic_process_metablock(state, + next_block, + &tmp_next_block_ctx, + hdr, + BTRFS_MAX_LEVEL + 3, 1); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + } + } + + kfree(selected_super); + return ret; +} + +static int btrfsic_process_superblock_dev_mirror( + struct btrfsic_state *state, + struct btrfsic_dev_state *dev_state, + struct btrfs_device *device, + int superblock_mirror_num, + struct btrfsic_dev_state **selected_dev_state, + struct btrfs_super_block *selected_super) +{ + struct btrfs_super_block *super_tmp; + u64 dev_bytenr; + struct buffer_head *bh; + struct btrfsic_block *superblock_tmp; + int pass; + struct block_device *const superblock_bdev = device->bdev; + + /* super block bytenr is always the unmapped device bytenr */ + dev_bytenr = btrfs_sb_offset(superblock_mirror_num); + bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (NULL == bh) + return -1; + super_tmp = (struct btrfs_super_block *) + (bh->b_data + (dev_bytenr & 4095)); + + if (btrfs_super_bytenr(super_tmp) != dev_bytenr || + strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, + sizeof(super_tmp->magic)) || + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + brelse(bh); + return 0; + } + + superblock_tmp = + btrfsic_block_hashtable_lookup(superblock_bdev, + dev_bytenr, + &state->block_hashtable); + if (NULL == superblock_tmp) { + superblock_tmp = btrfsic_block_alloc(); + if (NULL == superblock_tmp) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + brelse(bh); + return -1; + } + /* for superblock, only the dev_bytenr makes sense */ + superblock_tmp->dev_bytenr = dev_bytenr; + superblock_tmp->dev_state = dev_state; + superblock_tmp->logical_bytenr = dev_bytenr; + superblock_tmp->generation = btrfs_super_generation(super_tmp); + superblock_tmp->is_metadata = 1; + superblock_tmp->is_superblock = 1; + superblock_tmp->is_iodone = 1; + superblock_tmp->never_written = 0; + superblock_tmp->mirror_num = 1 + superblock_mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, device->name, + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); + list_add(&superblock_tmp->all_blocks_node, + &state->all_blocks_list); + btrfsic_block_hashtable_add(superblock_tmp, + &state->block_hashtable); + } + + /* select the one with the highest generation field */ + if (btrfs_super_generation(super_tmp) > + state->max_superblock_generation || + 0 == state->max_superblock_generation) { + memcpy(selected_super, super_tmp, sizeof(*selected_super)); + *selected_dev_state = dev_state; + state->max_superblock_generation = + btrfs_super_generation(super_tmp); + state->latest_superblock = superblock_tmp; + } + + for (pass = 0; pass < 3; pass++) { + u64 next_bytenr; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "initial root "; + next_bytenr = btrfs_super_root(super_tmp); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "initial chunk "; + next_bytenr = btrfs_super_chunk_root(super_tmp); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "initial log "; + next_bytenr = btrfs_super_log_root(super_tmp); + if (0 == next_bytenr) + continue; + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + + if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num)) { + printk(KERN_INFO "btrfsic: btrfsic_map_block(" + "bytenr @%llu, mirror %d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + brelse(bh); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, &tmp_next_block_ctx, + additional_string, 1, 1, 0, + mirror_num, NULL); + if (NULL == next_block) { + btrfsic_release_block_ctx(&tmp_next_block_ctx); + brelse(bh); + return -1; + } + + next_block->disk_key = tmp_disk_key; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, &tmp_next_block_ctx, + next_block, superblock_tmp, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) { + brelse(bh); + return -1; + } + } + } + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) + btrfsic_dump_tree_sub(state, superblock_tmp, 0); + + brelse(bh); + return 0; +} + +static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) +{ + struct btrfsic_stack_frame *sf; + + sf = kzalloc(sizeof(*sf), GFP_NOFS); + if (NULL == sf) + printk(KERN_INFO "btrfsic: alloc memory failed!\n"); + else + sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; + return sf; +} + +static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) +{ + BUG_ON(!(NULL == sf || + BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); + kfree(sf); +} + +static int btrfsic_process_metablock( + struct btrfsic_state *state, + struct btrfsic_block *const first_block, + struct btrfsic_block_data_ctx *const first_block_ctx, + struct btrfs_header *const first_hdr, + int first_limit_nesting, int force_iodone_flag) +{ + struct btrfsic_stack_frame initial_stack_frame = { 0 }; + struct btrfsic_stack_frame *sf; + struct btrfsic_stack_frame *next_stack; + + sf = &initial_stack_frame; + sf->error = 0; + sf->i = -1; + sf->limit_nesting = first_limit_nesting; + sf->block = first_block; + sf->block_ctx = first_block_ctx; + sf->next_block = NULL; + sf->hdr = first_hdr; + sf->prev = NULL; + +continue_with_new_stack_frame: + sf->block->generation = le64_to_cpu(sf->hdr->generation); + if (0 == sf->hdr->level) { + struct btrfs_leaf *const leafhdr = + (struct btrfs_leaf *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(leafhdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "leaf %llu items %d generation %llu" + " owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + sf->nr, + (unsigned long long) + le64_to_cpu(leafhdr->header.generation), + (unsigned long long) + le64_to_cpu(leafhdr->header.owner)); + } + +continue_with_current_leaf_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_item *disk_item = leafhdr->items + sf->i; + struct btrfs_disk_key *disk_key = &disk_item->key; + u8 type; + const u32 item_offset = le32_to_cpu(disk_item->offset); + + type = disk_key->type; + + if (BTRFS_ROOT_ITEM_KEY == type) { + const struct btrfs_root_item *const root_item = + (struct btrfs_root_item *) + (sf->block_ctx->data + + offsetof(struct btrfs_leaf, items) + + item_offset); + const u64 next_bytenr = + le64_to_cpu(root_item->bytenr); + + sf->error = + btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + disk_key, + le64_to_cpu(root_item-> + generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = + btrfsic_stack_frame_alloc(); + if (NULL == next_stack) { + btrfsic_release_block_ctx( + &sf-> + next_block_ctx); + goto one_stack_frame_backwards; + } + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = + &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + } else if (BTRFS_EXTENT_DATA_KEY == type && + state->include_extent_data) { + sf->error = btrfsic_handle_extent_data( + state, + sf->block, + sf->block_ctx, + item_offset, + force_iodone_flag); + if (sf->error) + goto one_stack_frame_backwards; + } + + goto continue_with_current_leaf_stack_frame; + } + } else { + struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; + + if (-1 == sf->i) { + sf->nr = le32_to_cpu(nodehdr->header.nritems); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "node %llu level %d items %d" + " generation %llu owner %llu\n", + (unsigned long long) + sf->block_ctx->start, + nodehdr->header.level, sf->nr, + (unsigned long long) + le64_to_cpu(nodehdr->header.generation), + (unsigned long long) + le64_to_cpu(nodehdr->header.owner)); + } + +continue_with_current_node_stack_frame: + if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { + sf->i++; + sf->num_copies = 0; + } + + if (sf->i < sf->nr) { + struct btrfs_key_ptr *disk_key_ptr = + nodehdr->ptrs + sf->i; + const u64 next_bytenr = + le64_to_cpu(disk_key_ptr->blockptr); + + sf->error = btrfsic_create_link_to_next_block( + state, + sf->block, + sf->block_ctx, + next_bytenr, + sf->limit_nesting, + &sf->next_block_ctx, + &sf->next_block, + force_iodone_flag, + &sf->num_copies, + &sf->mirror_num, + &disk_key_ptr->key, + le64_to_cpu(disk_key_ptr->generation)); + if (sf->error) + goto one_stack_frame_backwards; + + if (NULL != sf->next_block) { + struct btrfs_header *const next_hdr = + (struct btrfs_header *) + sf->next_block_ctx.data; + + next_stack = btrfsic_stack_frame_alloc(); + if (NULL == next_stack) + goto one_stack_frame_backwards; + + next_stack->i = -1; + next_stack->block = sf->next_block; + next_stack->block_ctx = &sf->next_block_ctx; + next_stack->next_block = NULL; + next_stack->hdr = next_hdr; + next_stack->limit_nesting = + sf->limit_nesting - 1; + next_stack->prev = sf; + sf = next_stack; + goto continue_with_new_stack_frame; + } + + goto continue_with_current_node_stack_frame; + } + } + +one_stack_frame_backwards: + if (NULL != sf->prev) { + struct btrfsic_stack_frame *const prev = sf->prev; + + /* the one for the initial block is freed in the caller */ + btrfsic_release_block_ctx(sf->block_ctx); + + if (sf->error) { + prev->error = sf->error; + btrfsic_stack_frame_free(sf); + sf = prev; + goto one_stack_frame_backwards; + } + + btrfsic_stack_frame_free(sf); + sf = prev; + goto continue_with_new_stack_frame; + } else { + BUG_ON(&initial_stack_frame != sf); + } + + return sf->error; +} + +static int btrfsic_create_link_to_next_block( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u64 next_bytenr, + int limit_nesting, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block **next_blockp, + int force_iodone_flag, + int *num_copiesp, int *mirror_nump, + struct btrfs_disk_key *disk_key, + u64 parent_generation) +{ + struct btrfsic_block *next_block = NULL; + int ret; + struct btrfsic_block_link *l; + int did_alloc_block_link; + int block_was_created; + + *next_blockp = NULL; + if (0 == *num_copiesp) { + *num_copiesp = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, *num_copiesp); + *mirror_nump = 1; + } + + if (*mirror_nump > *num_copiesp) + return 0; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_create_link_to_next_block(mirror_num=%d)\n", + *mirror_nump); + ret = btrfsic_map_block(state, next_bytenr, + BTRFSIC_BLOCK_SIZE, + next_block_ctx, *mirror_nump); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", + (unsigned long long)next_bytenr, *mirror_nump); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + next_block = btrfsic_block_lookup_or_add(state, + next_block_ctx, "referenced ", + 1, force_iodone_flag, + !force_iodone_flag, + *mirror_nump, + &block_was_created); + if (NULL == next_block) { + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + if (block_was_created) { + l = NULL; + next_block->generation = BTRFSIC_GENERATION_UNKNOWN; + } else { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block), + (unsigned long long)next_block->logical_bytenr); + } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)next_bytenr, + next_block_ctx->dev->name, + (unsigned long long)next_block_ctx->dev_bytenr, + *mirror_nump, + btrfsic_get_block_type(state, next_block)); + next_block->logical_bytenr = next_bytenr; + + next_block->mirror_num = *mirror_nump; + l = btrfsic_block_link_hashtable_lookup( + next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_link_hashtable); + } + + next_block->disk_key = *disk_key; + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + did_alloc_block_link = 1; + l->block_ref_to = next_block; + l->block_ref_from = block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + did_alloc_block_link = 0; + if (0 == limit_nesting) { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + } + + if (limit_nesting > 0 && did_alloc_block_link) { + ret = btrfsic_read_block(state, next_block_ctx); + if (ret < (int)BTRFSIC_BLOCK_SIZE) { + printk(KERN_INFO + "btrfsic: read block @logical %llu failed!\n", + (unsigned long long)next_bytenr); + btrfsic_release_block_ctx(next_block_ctx); + *next_blockp = NULL; + return -1; + } + + *next_blockp = next_block; + } else { + *next_blockp = NULL; + } + (*mirror_nump)++; + + return 0; +} + +static int btrfsic_handle_extent_data( + struct btrfsic_state *state, + struct btrfsic_block *block, + struct btrfsic_block_data_ctx *block_ctx, + u32 item_offset, int force_iodone_flag) +{ + int ret; + struct btrfs_file_extent_item *file_extent_item = + (struct btrfs_file_extent_item *)(block_ctx->data + + offsetof(struct btrfs_leaf, + items) + item_offset); + u64 next_bytenr = + le64_to_cpu(file_extent_item->disk_bytenr) + + le64_to_cpu(file_extent_item->offset); + u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); + u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfsic_block_link *l; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," + " offset = %llu, num_bytes = %llu\n", + file_extent_item->type, + (unsigned long long) + le64_to_cpu(file_extent_item->disk_bytenr), + (unsigned long long) + le64_to_cpu(file_extent_item->offset), + (unsigned long long) + le64_to_cpu(file_extent_item->num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || + ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) + return 0; + while (num_bytes > 0) { + u32 chunk_len; + int num_copies; + int mirror_num; + + if (num_bytes > BTRFSIC_BLOCK_SIZE) + chunk_len = BTRFSIC_BLOCK_SIZE; + else + chunk_len = num_bytes; + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + struct btrfsic_block_data_ctx next_block_ctx; + struct btrfsic_block *next_block; + int block_was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "btrfsic_handle_extent_data(" + "mirror_num=%d)\n", mirror_num); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO + "\tdisk_bytenr = %llu, num_bytes %u\n", + (unsigned long long)next_bytenr, + chunk_len); + ret = btrfsic_map_block(state, next_bytenr, + chunk_len, &next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &next_block_ctx, + "referenced ", + 0, + force_iodone_flag, + !force_iodone_flag, + mirror_num, + &block_was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&next_block_ctx); + return -1; + } + if (!block_was_created) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) { + printk(KERN_INFO + "Referenced block" + " @%llu (%s/%llu/%d)" + " found in hash table, D," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)next_bytenr, + next_block_ctx.dev->name, + (unsigned long long) + next_block_ctx.dev_bytenr, + mirror_num, + (unsigned long long) + next_block->logical_bytenr); + } + next_block->logical_bytenr = next_bytenr; + next_block->mirror_num = mirror_num; + } + + l = btrfsic_block_link_lookup_or_add(state, + &next_block_ctx, + next_block, block, + generation); + btrfsic_release_block_ctx(&next_block_ctx); + if (NULL == l) + return -1; + } + + next_bytenr += chunk_len; + num_bytes -= chunk_len; + } + + return 0; +} + +static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, + struct btrfsic_block_data_ctx *block_ctx_out, + int mirror_num) +{ + int ret; + u64 length; + struct btrfs_bio *multi = NULL; + struct btrfs_device *device; + + length = len; + ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, + bytenr, &length, &multi, mirror_num); + + device = multi->stripes[0].dev; + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + + if (0 == ret) + kfree(multi); + if (NULL == block_ctx_out->dev) { + ret = -ENXIO; + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); + } + + return ret; +} + +static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, + u32 len, struct block_device *bdev, + struct btrfsic_block_data_ctx *block_ctx_out) +{ + block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); + block_ctx_out->dev_bytenr = bytenr; + block_ctx_out->start = bytenr; + block_ctx_out->len = len; + block_ctx_out->data = NULL; + block_ctx_out->bh = NULL; + if (NULL != block_ctx_out->dev) { + return 0; + } else { + printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); + return -ENXIO; + } +} + +static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) +{ + if (NULL != block_ctx->bh) { + brelse(block_ctx->bh); + block_ctx->bh = NULL; + } +} + +static int btrfsic_read_block(struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx) +{ + block_ctx->bh = NULL; + if (block_ctx->dev_bytenr & 4095) { + printk(KERN_INFO + "btrfsic: read_block() with unaligned bytenr %llu\n", + (unsigned long long)block_ctx->dev_bytenr); + return -1; + } + if (block_ctx->len > 4096) { + printk(KERN_INFO + "btrfsic: read_block() with too huge size %d\n", + block_ctx->len); + return -1; + } + + block_ctx->bh = __bread(block_ctx->dev->bdev, + block_ctx->dev_bytenr >> 12, 4096); + if (NULL == block_ctx->bh) + return -1; + block_ctx->data = block_ctx->bh->b_data; + + return block_ctx->len; +} + +static void btrfsic_dump_database(struct btrfsic_state *state) +{ + struct list_head *elem_all; + + BUG_ON(NULL == state); + + printk(KERN_INFO "all_blocks_list:\n"); + list_for_each(elem_all, &state->all_blocks_list) { + const struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *elem_ref_from; + + printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + + list_for_each(elem_ref_to, &b_all->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " refers %u* to" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + } + + list_for_each(elem_ref_from, &b_all->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, + struct btrfsic_block_link, + node_ref_from); + + printk(KERN_INFO " %c @%llu (%s/%llu/%d)" + " is ref %u* from" + " %c @%llu (%s/%llu/%d)\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + } + + printk(KERN_INFO "\n"); + } +} + +/* + * Test whether the disk block contains a tree block (leaf or node) + * (note that this test fails for the super block) + */ +static int btrfsic_test_for_metadata(struct btrfsic_state *state, + const u8 *data, unsigned int size) +{ + struct btrfs_header *h; + u8 csum[BTRFS_CSUM_SIZE]; + u32 crc = ~(u32)0; + int fail = 0; + int crc_fail = 0; + + h = (struct btrfs_header *)data; + + if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + fail++; + + crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, csum); + if (memcmp(csum, h->csum, state->csum_size)) + crc_fail++; + + return fail || crc_fail; +} + +static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, + u8 *mapped_data, unsigned int len, + struct bio *bio, + int *bio_is_patched, + struct buffer_head *bh, + int submit_bio_bh_rw) +{ + int is_metadata; + struct btrfsic_block *block; + struct btrfsic_block_data_ctx block_ctx; + int ret; + struct btrfsic_state *state = dev_state->state; + struct block_device *bdev = dev_state->bdev; + + WARN_ON(len > PAGE_SIZE); + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); + if (NULL != bio_is_patched) + *bio_is_patched = 0; + + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, + &state->block_hashtable); + if (NULL != block) { + u64 bytenr = 0; + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + if (block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_super_block *) + mapped_data)->bytenr); + is_metadata = 1; + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { + printk(KERN_INFO + "[before new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } + if (is_metadata) { + if (!block->is_superblock) { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, + dev_state, + dev_bytenr, + mapped_data); + } + if (block->logical_bytenr != bytenr) { + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c," + " bytenr mismatch" + " (!= stored %llu).\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block), + (unsigned long long) + block->logical_bytenr); + block->logical_bytenr = bytenr; + } else if (state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } else { + bytenr = block->logical_bytenr; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d)" + " found in hash table, %c.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, block)); + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "ref_to_list: %cE, ref_from_list: %cE\n", + list_empty(&block->ref_to_list) ? ' ' : '!', + list_empty(&block->ref_from_list) ? ' ' : '!'); + if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), old(gen=%llu," + " objectid=%llu, type=%d, offset=%llu)," + " new(gen=%llu)," + " which is referenced by most recent superblock" + " (superblockgen=%llu)!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(block->disk_key.objectid), + block->disk_key.type, + (unsigned long long) + le64_to_cpu(block->disk_key.offset), + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation), + (unsigned long long) + state->max_superblock_generation); + btrfsic_dump_tree(state); + } + + if (!block->is_iodone && !block->never_written) { + printk(KERN_INFO "btrfs: attempt to overwrite %c-block" + " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," + " which is not yet iodone!\n", + btrfsic_get_block_type(state, block), + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + block->mirror_num, + (unsigned long long)block->generation, + (unsigned long long) + le64_to_cpu(((struct btrfs_header *) + mapped_data)->generation)); + /* it would not be safe to go on */ + btrfsic_dump_tree(state); + return; + } + + /* + * Clear all references of this block. Do not free + * the block itself even if is not referenced anymore + * because it still carries valueable information + * like whether it was ever written and IO completed. + */ + list_for_each_safe(elem_ref_to, tmp_ref_to, + &block->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + l->ref_cnt--; + if (0 == l->ref_cnt) { + list_del(&l->node_ref_to); + list_del(&l->node_ref_from); + btrfsic_block_link_hashtable_remove(l); + btrfsic_block_link_free(l); + } + } + + if (block->is_superblock) + ret = btrfsic_map_superblock(state, bytenr, len, + bdev, &block_ctx); + else + ret = btrfsic_map_block(state, bytenr, len, + &block_ctx, 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", (unsigned long long)bytenr); + return; + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + if (is_metadata || state->include_extent_data) { + block->never_written = 0; + block->iodone_w_error = 0; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = + bio->bi_private; + block->orig_bio_bh_end_io.bio = + bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io. + bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + } + + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (is_metadata) { + block->logical_bytenr = bytenr; + block->is_metadata = 1; + if (block->is_superblock) { + ret = btrfsic_process_written_superblock( + state, + block, + (struct btrfs_super_block *) + mapped_data); + if (state->print_mask & + BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { + printk(KERN_INFO + "[after new superblock is written]:\n"); + btrfsic_dump_tree_sub(state, block, 0); + } + } else { + block->mirror_num = 0; /* unknown */ + ret = btrfsic_process_metablock( + state, + block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, + 0, 0); + } + if (ret) + printk(KERN_INFO + "btrfsic: btrfsic_process_metablock" + "(root @%llu) failed!\n", + (unsigned long long)dev_bytenr); + } else { + block->is_metadata = 0; + block->mirror_num = 0; /* unknown */ + block->generation = BTRFSIC_GENERATION_UNKNOWN; + if (!state->include_extent_data + && list_empty(&block->ref_from_list)) { + /* + * disk block is overwritten with extent + * data (not meta data) and we are configured + * to not include extent data: take the + * chance and free the block's memory + */ + btrfsic_block_hashtable_remove(block); + list_del(&block->all_blocks_node); + btrfsic_block_free(block); + } + } + btrfsic_release_block_ctx(&block_ctx); + } else { + /* block has not been found in hash table */ + u64 bytenr; + + if (!is_metadata) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO "Written block (%s/%llu/?)" + " !found in hash table, D.\n", + dev_state->name, + (unsigned long long)dev_bytenr); + if (!state->include_extent_data) + return; /* ignore that written D block */ + + /* this is getting ugly for the + * include_extent_data case... */ + bytenr = 0; /* unknown */ + block_ctx.start = bytenr; + block_ctx.len = len; + block_ctx.bh = NULL; + } else { + bytenr = le64_to_cpu(((struct btrfs_header *) + mapped_data)->bytenr); + btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, + dev_bytenr, + mapped_data); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "Written block @%llu (%s/%llu/?)" + " !found in hash table, M.\n", + (unsigned long long)bytenr, + dev_state->name, + (unsigned long long)dev_bytenr); + + ret = btrfsic_map_block(state, bytenr, len, &block_ctx, + 0); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + return; + } + } + block_ctx.data = mapped_data; + /* the following is required in case of writes to mirrors, + * use the same that was used for the lookup */ + block_ctx.dev = dev_state; + block_ctx.dev_bytenr = dev_bytenr; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&block_ctx); + return; + } + block->dev_state = dev_state; + block->dev_bytenr = dev_bytenr; + block->logical_bytenr = bytenr; + block->is_metadata = is_metadata; + block->never_written = 0; + block->iodone_w_error = 0; + block->mirror_num = 0; /* unknown */ + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = submit_bio_bh_rw; + if (NULL != bio) { + block->is_iodone = 0; + BUG_ON(NULL == bio_is_patched); + if (!*bio_is_patched) { + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + *bio_is_patched = 1; + } else { + struct btrfsic_block *chained_block = + (struct btrfsic_block *) + bio->bi_private; + + BUG_ON(NULL == chained_block); + block->orig_bio_bh_private = + chained_block->orig_bio_bh_private; + block->orig_bio_bh_end_io.bio = + chained_block->orig_bio_bh_end_io.bio; + block->next_in_same_bio = chained_block; + bio->bi_private = block; + } + } else if (NULL != bh) { + block->is_iodone = 0; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } else { + block->is_iodone = 1; + block->orig_bio_bh_private = NULL; + block->orig_bio_bh_end_io.bio = NULL; + block->next_in_same_bio = NULL; + } + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New written %c-block @%llu (%s/%llu/%d)\n", + is_metadata ? 'M' : 'D', + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + + if (is_metadata) { + ret = btrfsic_process_metablock(state, block, + &block_ctx, + (struct btrfs_header *) + block_ctx.data, 0, 0); + if (ret) + printk(KERN_INFO + "btrfsic: process_metablock(root @%llu)" + " failed!\n", + (unsigned long long)dev_bytenr); + } + btrfsic_release_block_ctx(&block_ctx); + } +} + +static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + int iodone_w_error; + + /* mutex is not held! This is not save if IO is not yet completed + * on umount */ + iodone_w_error = 0; + if (bio_error_status) + iodone_w_error = 1; + + BUG_ON(NULL == block); + bp->bi_private = block->orig_bio_bh_private; + bp->bi_end_io = block->orig_bio_bh_end_io.bio; + + do { + struct btrfsic_block *next_block; + struct btrfsic_dev_state *const dev_state = block->dev_state; + + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + bio_error_status, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + next_block = block->next_in_same_bio; + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bio_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long) + dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is + * on disk */ + block->is_iodone = 1; /* for FLUSH, this releases the block */ + block = next_block; + } while (NULL != block); + + bp->bi_end_io(bp, bio_error_status); +} + +static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) +{ + struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; + int iodone_w_error = !uptodate; + struct btrfsic_dev_state *dev_state; + + BUG_ON(NULL == block); + dev_state = block->dev_state; + if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", + iodone_w_error, + btrfsic_get_block_type(dev_state->state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + + block->iodone_w_error = iodone_w_error; + if (block->submit_bio_bh_rw & REQ_FLUSH) { + dev_state->last_flush_gen++; + if ((dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) + printk(KERN_INFO + "bh_end_io() new %s flush_gen=%llu\n", + dev_state->name, + (unsigned long long)dev_state->last_flush_gen); + } + if (block->submit_bio_bh_rw & REQ_FUA) + block->flush_gen = 0; /* FUA completed means block is on disk */ + + bh->b_private = block->orig_bio_bh_private; + bh->b_end_io = block->orig_bio_bh_end_io.bh; + block->is_iodone = 1; /* for FLUSH, this releases the block */ + bh->b_end_io(bh, uptodate); +} + +static int btrfsic_process_written_superblock( + struct btrfsic_state *state, + struct btrfsic_block *const superblock, + struct btrfs_super_block *const super_hdr) +{ + int pass; + + superblock->generation = btrfs_super_generation(super_hdr); + if (!(superblock->generation > state->max_superblock_generation || + 0 == state->max_superblock_generation)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: superblock @%llu (%s/%llu/%d)" + " with old gen %llu <= %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + } else { + if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) + printk(KERN_INFO + "btrfsic: got new superblock @%llu (%s/%llu/%d)" + " with new gen %llu > %llu\n", + (unsigned long long)superblock->logical_bytenr, + superblock->dev_state->name, + (unsigned long long)superblock->dev_bytenr, + superblock->mirror_num, + (unsigned long long) + btrfs_super_generation(super_hdr), + (unsigned long long) + state->max_superblock_generation); + + state->max_superblock_generation = + btrfs_super_generation(super_hdr); + state->latest_superblock = superblock; + } + + for (pass = 0; pass < 3; pass++) { + int ret; + u64 next_bytenr; + struct btrfsic_block *next_block; + struct btrfsic_block_data_ctx tmp_next_block_ctx; + struct btrfsic_block_link *l; + int num_copies; + int mirror_num; + const char *additional_string = NULL; + struct btrfs_disk_key tmp_disk_key; + + tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_disk_key.offset = 0; + + switch (pass) { + case 0: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + additional_string = "root "; + next_bytenr = btrfs_super_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "root@%llu\n", + (unsigned long long)next_bytenr); + break; + case 1: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + additional_string = "chunk "; + next_bytenr = btrfs_super_chunk_root(super_hdr); + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "chunk@%llu\n", + (unsigned long long)next_bytenr); + break; + case 2: + tmp_disk_key.objectid = + cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + additional_string = "log "; + next_bytenr = btrfs_super_log_root(super_hdr); + if (0 == next_bytenr) + continue; + if (state->print_mask & + BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) + printk(KERN_INFO "log@%llu\n", + (unsigned long long)next_bytenr); + break; + } + + num_copies = + btrfs_num_copies(&state->root->fs_info->mapping_tree, + next_bytenr, PAGE_SIZE); + if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) + printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", + (unsigned long long)next_bytenr, num_copies); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + int was_created; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic_process_written_superblock(" + "mirror_num=%d)\n", mirror_num); + ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + &tmp_next_block_ctx, + mirror_num); + if (ret) { + printk(KERN_INFO + "btrfsic: btrfsic_map_block(@%llu," + " mirror=%d) failed!\n", + (unsigned long long)next_bytenr, + mirror_num); + return -1; + } + + next_block = btrfsic_block_lookup_or_add( + state, + &tmp_next_block_ctx, + additional_string, + 1, 0, 1, + mirror_num, + &was_created); + if (NULL == next_block) { + printk(KERN_INFO + "btrfsic: error, kmalloc failed!\n"); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + return -1; + } + + next_block->disk_key = tmp_disk_key; + if (was_created) + next_block->generation = + BTRFSIC_GENERATION_UNKNOWN; + l = btrfsic_block_link_lookup_or_add( + state, + &tmp_next_block_ctx, + next_block, + superblock, + BTRFSIC_GENERATION_UNKNOWN); + btrfsic_release_block_ctx(&tmp_next_block_ctx); + if (NULL == l) + return -1; + } + } + + if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { + WARN_ON(1); + btrfsic_dump_tree(state); + } + + return 0; +} + +static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, + struct btrfsic_block *const block, + int recursion_level) +{ + struct list_head *elem_ref_to; + int ret = 0; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* + * Note that this situation can happen and does not + * indicate an error in regular cases. It happens + * when disk blocks are freed and later reused. + * The check-integrity module is not aware of any + * block free operations, it just recognizes block + * write operations. Therefore it keeps the linkage + * information for a block until a block is + * rewritten. This can temporarily cause incorrect + * and even circular linkage informations. This + * causes no harm unless such blocks are referenced + * by the most recent super block. + */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 1).\n"); + + return ret; + } + + /* + * This algorithm is recursive because the amount of used stack + * space is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " %u* refers to %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + if (l->block_ref_to->never_written) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is never written!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (!l->block_ref_to->is_iodone) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not yet iodone!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); + ret = -1; + } else if (l->parent_generation != + l->block_ref_to->generation && + BTRFSIC_GENERATION_UNKNOWN != + l->parent_generation && + BTRFSIC_GENERATION_UNKNOWN != + l->block_ref_to->generation) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " with generation %llu !=" + " parent generation %llu!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)l->block_ref_to->generation, + (unsigned long long)l->parent_generation); + ret = -1; + } else if (l->block_ref_to->flush_gen > + l->block_ref_to->dev_state->last_flush_gen) { + printk(KERN_INFO "btrfs: attempt to write superblock" + " which references block %c @%llu (%s/%llu/%d)" + " which is not flushed out of disk's write cache" + " (block flush_gen=%llu," + " dev->flush_gen=%llu)!\n", + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long) + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, + (unsigned long long)block->flush_gen, + (unsigned long long) + l->block_ref_to->dev_state->last_flush_gen); + ret = -1; + } else if (-1 == btrfsic_check_all_ref_blocks(state, + l->block_ref_to, + recursion_level + + 1)) { + ret = -1; + } + } + + return ret; +} + +static int btrfsic_is_block_ref_by_superblock( + const struct btrfsic_state *state, + const struct btrfsic_block *block, + int recursion_level) +{ + struct list_head *elem_ref_from; + + if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { + /* refer to comment at "abort cyclic linkage (case 1)" */ + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "btrfsic: abort cyclic linkage (case 2).\n"); + + return 0; + } + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + list_for_each(elem_ref_from, &block->ref_from_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_from, struct btrfsic_block_link, + node_ref_from); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "rl=%d, %c @%llu (%s/%llu/%d)" + " is ref %u* from %c @%llu (%s/%llu/%d)\n", + recursion_level, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num, + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long) + l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long) + l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num); + if (l->block_ref_from->is_superblock && + state->latest_superblock->dev_bytenr == + l->block_ref_from->dev_bytenr && + state->latest_superblock->dev_state->bdev == + l->block_ref_from->dev_state->bdev) + return 1; + else if (btrfsic_is_block_ref_by_superblock(state, + l->block_ref_from, + recursion_level + + 1)) + return 1; + } + + return 0; +} + +static void btrfsic_print_add_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Add %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static void btrfsic_print_rem_link(const struct btrfsic_state *state, + const struct btrfsic_block_link *l) +{ + printk(KERN_INFO + "Rem %u* link from %c @%llu (%s/%llu/%d)" + " to %c @%llu (%s/%llu/%d).\n", + l->ref_cnt, + btrfsic_get_block_type(state, l->block_ref_from), + (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->dev_state->name, + (unsigned long long)l->block_ref_from->dev_bytenr, + l->block_ref_from->mirror_num, + btrfsic_get_block_type(state, l->block_ref_to), + (unsigned long long)l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, + (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num); +} + +static char btrfsic_get_block_type(const struct btrfsic_state *state, + const struct btrfsic_block *block) +{ + if (block->is_superblock && + state->latest_superblock->dev_bytenr == block->dev_bytenr && + state->latest_superblock->dev_state->bdev == block->dev_state->bdev) + return 'S'; + else if (block->is_superblock) + return 's'; + else if (block->is_metadata) + return 'M'; + else + return 'D'; +} + +static void btrfsic_dump_tree(const struct btrfsic_state *state) +{ + btrfsic_dump_tree_sub(state, state->latest_superblock, 0); +} + +static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, + const struct btrfsic_block *block, + int indent_level) +{ + struct list_head *elem_ref_to; + int indent_add; + static char buf[80]; + int cursor_position; + + /* + * Should better fill an on-stack buffer with a complete line and + * dump it at once when it is time to print a newline character. + */ + + /* + * This algorithm is recursive because the amount of used stack space + * is very small and the max recursion depth is limited. + */ + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + block->dev_state->name, + (unsigned long long)block->dev_bytenr, + block->mirror_num); + if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + return; + } + printk(buf); + indent_level += indent_add; + if (list_empty(&block->ref_to_list)) { + printk("\n"); + return; + } + if (block->mirror_num > 1 && + !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { + printk(" [...]\n"); + return; + } + + cursor_position = indent_level; + list_for_each(elem_ref_to, &block->ref_to_list) { + const struct btrfsic_block_link *const l = + list_entry(elem_ref_to, struct btrfsic_block_link, + node_ref_to); + + while (cursor_position < indent_level) { + printk(" "); + cursor_position++; + } + if (l->ref_cnt > 1) + indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); + else + indent_add = sprintf(buf, " --> "); + if (indent_level + indent_add > + BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { + printk("[...]\n"); + cursor_position = 0; + continue; + } + + printk(buf); + + btrfsic_dump_tree_sub(state, l->block_ref_to, + indent_level + indent_add); + cursor_position = 0; + } +} + +static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *next_block_ctx, + struct btrfsic_block *next_block, + struct btrfsic_block *from_block, + u64 parent_generation) +{ + struct btrfsic_block_link *l; + + l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, + next_block_ctx->dev_bytenr, + from_block->dev_state->bdev, + from_block->dev_bytenr, + &state->block_link_hashtable); + if (NULL == l) { + l = btrfsic_block_link_alloc(); + if (NULL == l) { + printk(KERN_INFO + "btrfsic: error, kmalloc" " failed!\n"); + return NULL; + } + + l->block_ref_to = next_block; + l->block_ref_from = from_block; + l->ref_cnt = 1; + l->parent_generation = parent_generation; + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + + list_add(&l->node_ref_to, &from_block->ref_to_list); + list_add(&l->node_ref_from, &next_block->ref_from_list); + + btrfsic_block_link_hashtable_add(l, + &state->block_link_hashtable); + } else { + l->ref_cnt++; + l->parent_generation = parent_generation; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_add_link(state, l); + } + + return l; +} + +static struct btrfsic_block *btrfsic_block_lookup_or_add( + struct btrfsic_state *state, + struct btrfsic_block_data_ctx *block_ctx, + const char *additional_string, + int is_metadata, + int is_iodone, + int never_written, + int mirror_num, + int *was_created) +{ + struct btrfsic_block *block; + + block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, + block_ctx->dev_bytenr, + &state->block_hashtable); + if (NULL == block) { + struct btrfsic_dev_state *dev_state; + + block = btrfsic_block_alloc(); + if (NULL == block) { + printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); + return NULL; + } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + if (NULL == dev_state) { + printk(KERN_INFO + "btrfsic: error, lookup dev_state failed!\n"); + btrfsic_block_free(block); + return NULL; + } + block->dev_state = dev_state; + block->dev_bytenr = block_ctx->dev_bytenr; + block->logical_bytenr = block_ctx->start; + block->is_metadata = is_metadata; + block->is_iodone = is_iodone; + block->never_written = never_written; + block->mirror_num = mirror_num; + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + printk(KERN_INFO + "New %s%c-block @%llu (%s/%llu/%d)\n", + additional_string, + btrfsic_get_block_type(state, block), + (unsigned long long)block->logical_bytenr, + dev_state->name, + (unsigned long long)block->dev_bytenr, + mirror_num); + list_add(&block->all_blocks_node, &state->all_blocks_list); + btrfsic_block_hashtable_add(block, &state->block_hashtable); + if (NULL != was_created) + *was_created = 1; + } else { + if (NULL != was_created) + *was_created = 0; + } + + return block; +} + +static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, + u64 bytenr, + struct btrfsic_dev_state *dev_state, + u64 dev_bytenr, char *data) +{ + int num_copies; + int mirror_num; + int ret; + struct btrfsic_block_data_ctx block_ctx; + int match = 0; + + num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, + bytenr, PAGE_SIZE); + + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) { + printk(KERN_INFO "btrfsic:" + " btrfsic_map_block(logical @%llu," + " mirror %d) failed!\n", + (unsigned long long)bytenr, mirror_num); + continue; + } + + if (dev_state->bdev == block_ctx.dev->bdev && + dev_bytenr == block_ctx.dev_bytenr) { + match++; + btrfsic_release_block_ctx(&block_ctx); + break; + } + btrfsic_release_block_ctx(&block_ctx); + } + + if (!match) { + printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," + " buffer->log_bytenr=%llu, submit_bio(bdev=%s," + " phys_bytenr=%llu)!\n", + (unsigned long long)bytenr, dev_state->name, + (unsigned long long)dev_bytenr); + for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { + ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + &block_ctx, mirror_num); + if (ret) + continue; + + printk(KERN_INFO "Read logical bytenr @%llu maps to" + " (%s/%llu/%d)\n", + (unsigned long long)bytenr, + block_ctx.dev->name, + (unsigned long long)block_ctx.dev_bytenr, + mirror_num); + } + WARN_ON(1); + } +} + +static struct btrfsic_dev_state *btrfsic_dev_state_lookup( + struct block_device *bdev) +{ + struct btrfsic_dev_state *ds; + + ds = btrfsic_dev_state_hashtable_lookup(bdev, + &btrfsic_dev_state_hashtable); + return ds; +} + +int btrfsic_submit_bh(int rw, struct buffer_head *bh) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return submit_bh(rw, bh); + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bh() might also be called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + + /* Only called to write the superblock (incl. FLUSH/FUA) */ + if (NULL != dev_state && + (rw & WRITE) && bh->b_size > 0) { + u64 dev_bytenr; + + dev_bytenr = 4096 * bh->b_blocknr; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," + " size=%lu, data=%p, bdev=%p)\n", + rw, (unsigned long)bh->b_blocknr, + (unsigned long long)dev_bytenr, + (unsigned long)bh->b_size, bh->b_data, + bh->b_bdev); + btrfsic_process_written_block(dev_state, dev_bytenr, + bh->b_data, bh->b_size, NULL, + NULL, bh, rw); + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bh->b_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bh(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bh->b_private; + block->orig_bio_bh_end_io.bh = bh->b_end_io; + block->next_in_same_bio = NULL; + bh->b_private = block; + bh->b_end_io = btrfsic_bh_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + return submit_bh(rw, bh); +} + +void btrfsic_submit_bio(int rw, struct bio *bio) +{ + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) { + submit_bio(rw, bio); + return; + } + + mutex_lock(&btrfsic_mutex); + /* since btrfsic_submit_bio() is also called before + * btrfsic_mount(), this might return NULL */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + if (NULL != dev_state && + (rw & WRITE) && NULL != bio->bi_io_vec) { + unsigned int i; + u64 dev_bytenr; + int bio_is_patched; + + dev_bytenr = 512 * bio->bi_sector; + bio_is_patched = 0; + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x, bi_vcnt=%u," + " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, + (unsigned long long)dev_bytenr, + bio->bi_bdev); + + for (i = 0; i < bio->bi_vcnt; i++) { + u8 *mapped_data; + + mapped_data = kmap(bio->bi_io_vec[i].bv_page); + if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE) == + (dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "#%u: page=%p, mapped=%p, len=%u," + " offset=%u\n", + i, bio->bi_io_vec[i].bv_page, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio->bi_io_vec[i].bv_offset); + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_data, + bio->bi_io_vec[i].bv_len, + bio, &bio_is_patched, + NULL, rw); + kunmap(bio->bi_io_vec[i].bv_page); + dev_bytenr += bio->bi_io_vec[i].bv_len; + } + } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + printk(KERN_INFO + "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + rw, bio->bi_bdev); + if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) + printk(KERN_INFO + "btrfsic_submit_bio(%s) with FLUSH" + " but dummy block already in use" + " (ignored)!\n", + dev_state->name); + } else { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = rw; + block->orig_bio_bh_private = bio->bi_private; + block->orig_bio_bh_end_io.bio = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } + } + mutex_unlock(&btrfsic_mutex); + + submit_bio(rw, bio); +} + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask) +{ + int ret; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + state = kzalloc(sizeof(*state), GFP_NOFS); + if (NULL == state) { + printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); + return -1; + } + + if (!btrfsic_is_initialized) { + mutex_init(&btrfsic_mutex); + btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); + btrfsic_is_initialized = 1; + } + mutex_lock(&btrfsic_mutex); + state->root = root; + state->print_mask = print_mask; + state->include_extent_data = including_extent_data; + state->csum_size = 0; + INIT_LIST_HEAD(&state->all_blocks_list); + btrfsic_block_hashtable_init(&state->block_hashtable); + btrfsic_block_link_hashtable_init(&state->block_link_hashtable); + state->max_superblock_generation = 0; + state->latest_superblock = NULL; + + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + char *p; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_alloc(); + if (NULL == ds) { + printk(KERN_INFO + "btrfs check-integrity: kmalloc() failed!\n"); + mutex_unlock(&btrfsic_mutex); + return -1; + } + ds->bdev = device->bdev; + ds->state = state; + bdevname(ds->bdev, ds->name); + ds->name[BDEVNAME_SIZE - 1] = '\0'; + for (p = ds->name; *p != '\0'; p++); + while (p > ds->name && *p != '/') + p--; + if (*p == '/') + p++; + strlcpy(ds->name, p, sizeof(ds->name)); + btrfsic_dev_state_hashtable_add(ds, + &btrfsic_dev_state_hashtable); + } + + ret = btrfsic_process_superblock(state, fs_devices); + if (0 != ret) { + mutex_unlock(&btrfsic_mutex); + btrfsic_unmount(root, fs_devices); + return ret; + } + + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) + btrfsic_dump_database(state); + if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) + btrfsic_dump_tree(state); + + mutex_unlock(&btrfsic_mutex); + return 0; +} + +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices) +{ + struct list_head *elem_all; + struct list_head *tmp_all; + struct btrfsic_state *state; + struct list_head *dev_head = &fs_devices->devices; + struct btrfs_device *device; + + if (!btrfsic_is_initialized) + return; + + mutex_lock(&btrfsic_mutex); + + state = NULL; + list_for_each_entry(device, dev_head, dev_list) { + struct btrfsic_dev_state *ds; + + if (!device->bdev || !device->name) + continue; + + ds = btrfsic_dev_state_hashtable_lookup( + device->bdev, + &btrfsic_dev_state_hashtable); + if (NULL != ds) { + state = ds->state; + btrfsic_dev_state_hashtable_remove(ds); + btrfsic_dev_state_free(ds); + } + } + + if (NULL == state) { + printk(KERN_INFO + "btrfsic: error, cannot find state information" + " on umount!\n"); + mutex_unlock(&btrfsic_mutex); + return; + } + + /* + * Don't care about keeping the lists' state up to date, + * just free all memory that was allocated dynamically. + * Free the blocks and the block_links. + */ + list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { + struct btrfsic_block *const b_all = + list_entry(elem_all, struct btrfsic_block, + all_blocks_node); + struct list_head *elem_ref_to; + struct list_head *tmp_ref_to; + + list_for_each_safe(elem_ref_to, tmp_ref_to, + &b_all->ref_to_list) { + struct btrfsic_block_link *const l = + list_entry(elem_ref_to, + struct btrfsic_block_link, + node_ref_to); + + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) + btrfsic_print_rem_link(state, l); + + l->ref_cnt--; + if (0 == l->ref_cnt) + btrfsic_block_link_free(l); + } + + if (b_all->is_iodone) + btrfsic_block_free(b_all); + else + printk(KERN_INFO "btrfs: attempt to free %c-block" + " @%llu (%s/%llu/%d) on umount which is" + " not yet iodone!\n", + btrfsic_get_block_type(state, b_all), + (unsigned long long)b_all->logical_bytenr, + b_all->dev_state->name, + (unsigned long long)b_all->dev_bytenr, + b_all->mirror_num); + } + + mutex_unlock(&btrfsic_mutex); + + kfree(state); +} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h new file mode 100644 index 00000000000..8b59175cc50 --- /dev/null +++ b/fs/btrfs/check-integrity.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) STRATO AG 2011. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#if !defined(__BTRFS_CHECK_INTEGRITY__) +#define __BTRFS_CHECK_INTEGRITY__ + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +int btrfsic_submit_bh(int rw, struct buffer_head *bh); +void btrfsic_submit_bio(int rw, struct bio *bio); +#else +#define btrfsic_submit_bh submit_bh +#define btrfsic_submit_bio submit_bio +#endif + +int btrfsic_mount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, + int including_extent_data, u32 print_mask); +void btrfsic_unmount(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d..d02c27cd14c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dede441bdee..0639a555e16 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0); + buf->start, 0, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); if (ret) return ret; @@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1); + ret = btrfs_inc_ref(trans, root, buf, 1, 1); BUG_ON(ret); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0); + ret = btrfs_dec_ref(trans, root, buf, 0, 1); BUG_ON(ret); - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); BUG_ON(ret); } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); } if (new_flags != 0) { @@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1); + ret = btrfs_inc_ref(trans, root, cow, 1, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0); + ret = btrfs_inc_ref(trans, root, cow, 0, 1); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, buf, 1); + ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); } clean_tree_block(trans, root, buf); @@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, 1); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref); + last_ref, 1); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ free_extent_buffer(mid); return 0; @@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1); + btrfs_free_tree_block(trans, root, right, 0, 1, 0); free_extent_buffer(right); right = NULL; } else { @@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1); + btrfs_free_tree_block(trans, root, mid, 0, 1, 0); free_extent_buffer(mid); mid = NULL; } else { @@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0); + level, root->node->start, 0, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + &disk_key, level, c->start, 0, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2970,7 +2970,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + &disk_key, 0, l->start, 0, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - btrfs_free_tree_block(trans, root, leaf, 0, 1); + btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); return 0; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323..80b6486fd5e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,9 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -692,6 +695,54 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + __le64 unused[8]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -751,14 +802,32 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_NR_RAID_TYPES 5 +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_NR_RAID_TYPES 5 + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) struct btrfs_block_group_item { __le64 used; @@ -817,7 +886,7 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full:1; + unsigned int full; }; /* @@ -916,6 +985,7 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -971,7 +1041,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:20; + unsigned long mount_opt:21; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1132,12 +1202,23 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; + /* + * these three are in extended format (availability of single + * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other + * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) + */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; + + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -1155,6 +1236,10 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* filesystem state */ u64 fs_state; @@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_BALANCE_ITEM_KEY 248 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_super_block */ +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); +} + +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, @@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } -static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } @@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size); + u64 hint, u64 empty_size, int for_cow); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref); + u64 parent, int last_ref, int for_cow); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); + struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, @@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + u64 root_objectid, u64 owner, u64 offset, int for_cow); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + ++p->slots[0]; + if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) + return btrfs_next_leaf(root, p); + return 0; +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); @@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_for_commit); kfree(fs_info); } +/** + * profile_is_valid - tests whether a given profile is valid and reduced + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile + */ +static inline int profile_is_valid(u64 flags, int extended) +{ + u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; + if (extended) + mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & mask) + return 0; + /* true if zero or exactly one bit set */ + return (flags & (~flags + 1)) == flags; +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 9c1eccc2c50..fe4cd0f1cef 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, + num_bytes, 1); item->bytes_reserved = num_bytes; + } return ret; } @@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_item", + item->key.objectid, item->bytes_reserved, + 0); btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - int release = false; + bool release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (ret == -EAGAIN) ret = -ENOSPC; - if (!ret) + if (!ret) { node->bytes_reserved = num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "delayed_inode", + btrfs_ino(inode), + num_bytes, 1); + } return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); @@ -707,11 +719,17 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ - if (!ret) + if (!ret) { + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; + } - if (release) + if (release) { + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, src_rsv, num_bytes); + } return ret; } @@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, return; rsv = &root->fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - delayed_item->key.objectid = btrfs_ino(dir); btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); delayed_item->key.offset = index; @@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item->type = type; memcpy((char *)(dir_item + 1), name, name_len); + ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + /* + * we have reserved enough space when we start a new transaction, + * so reserving metadata failure is impossible + */ + BUG_ON(ret); + + mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 125cf76fcd0..66e4f29505a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + /* merging of sequenced refs is not allowed */ + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), @@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, /* * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot + * head if it was able to find one, or NULL if nothing was in that spot. + * If return_bigger is given, the next bigger entry is returned if no exact + * match is found. */ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_node **last) + struct btrfs_delayed_ref_node **last, + int return_bigger) { - struct rb_node *n = root->rb_node; + struct rb_node *n; struct btrfs_delayed_ref_node *entry; - int cmp; + int cmp = 0; +again: + n = root->rb_node; + entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); @@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, else return entry; } + if (entry && return_bigger) { + if (cmp > 0) { + n = rb_next(&entry->rb_node); + if (!n) + n = rb_first(root); + entry = rb_entry(n, struct btrfs_delayed_ref_node, + rb_node); + bytenr = entry->bytenr; + return_bigger = 0; + goto again; + } + return entry; + } return NULL; } @@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +{ + struct seq_list *elem; + + assert_spin_locked(&delayed_refs->lock); + if (list_empty(&delayed_refs->seq_head)) + return 0; + + elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", + seq, elem->seq, delayed_refs); + return 1; + } + return 0; +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 start) { @@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - find_ref_head(&delayed_refs->root, start, &ref); + find_ref_head(&delayed_refs->root, start + 1, &ref, 1); if (ref) { - struct btrfs_delayed_ref_node *tmp; - - node = rb_prev(&ref->rb_node); - while (node) { - tmp = rb_entry(node, - struct btrfs_delayed_ref_node, - rb_node); - if (tmp->bytenr < start) - break; - ref = tmp; - node = rb_prev(&ref->rb_node); - } node = &ref->rb_node; } else node = rb_first(&delayed_refs->root); @@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * this does all the dirty work in terms of maintaining the correct * overall modification count. */ -static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, +static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, int action, int is_data) @@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, ref->action = 0; ref->is_head = 1; ref->in_tree = 1; + ref->seq = 0; head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; @@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, /* * helper to insert a delayed tree ref into the rbtree. */ -static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action) + u64 ref_root, int level, int action, + int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_tree_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_TREE_BLOCK_REF_KEY; - } full_ref->level = level; trace_btrfs_delayed_tree_ref(ref, full_ref, action); @@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * helper to insert a delayed data ref into the rbtree. */ -static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, +static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) + int action, int for_cow) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; + u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, ref->is_head = 0; ref->in_tree = 1; + if (need_ref_seq(for_cow, ref_root)) + seq = inc_delayed_seq(delayed_refs); + ref->seq = seq; + full_ref = btrfs_delayed_node_to_data_ref(ref); - if (parent) { - full_ref->parent = parent; + full_ref->parent = parent; + full_ref->root = ref_root; + if (parent) ref->type = BTRFS_SHARED_DATA_REF_KEY; - } else { - full_ref->root = ref_root; + else ref->type = BTRFS_EXTENT_DATA_REF_KEY; - } + full_ref->objectid = owner; full_ref->offset = offset; @@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 0); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 0); BUG_ON(ret); - ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, level, action); + ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, level, action, + for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int for_cow) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; @@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, - action, 1); + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, + num_bytes, action, 1); BUG_ON(ret); - ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, owner, offset, action); + ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, + num_bytes, parent, ref_root, owner, offset, + action, for_cow); BUG_ON(ret); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { @@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); BUG_ON(ret); + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); return 0; } @@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e287e3b0eab..d8f244d9492 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node { /* the size of the extent */ u64 num_bytes; + /* seq number to keep track of insertion order */ + u64 seq; + /* ref count on this data structure */ atomic_t refs; @@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head { struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; int level; }; struct btrfs_delayed_data_ref { struct btrfs_delayed_ref_node node; - union { - u64 root; - u64 parent; - }; + u64 root; + u64 parent; u64 objectid; u64 offset; }; @@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * seq number of delayed refs. We need to know if a backref was being + * added before the currently processed ref or afterwards. + */ + u64 seq; + + /* + * seq_list holds a list of all seq numbers that are currently being + * added to the list. While walking backrefs (btrfs_find_all_roots, + * qgroups), which might take some time, no newer ref must be processed, + * as it might influence the outcome of the walk. + */ + struct list_head seq_head; + + /* + * when the only refs we have in the list must not be processed, we want + * to wait for more refs to show up or for the end of backref walking. + */ + wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, +int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_extent_op *extent_op, + int for_cow); +int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); @@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); + +struct seq_list { + struct list_head list; + u64 seq; +}; + +static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) +{ + assert_spin_locked(&delayed_refs->lock); + ++delayed_refs->seq; + return delayed_refs->seq; +} + +static inline void +btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + assert_spin_locked(&delayed_refs->lock); + elem->seq = delayed_refs->seq; + list_add_tail(&elem->list, &delayed_refs->seq_head); +} + +static inline void +btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + struct seq_list *elem) +{ + spin_lock(&delayed_refs->lock); + list_del(&elem->list); + wake_up(&delayed_refs->seq_wait); + spin_unlock(&delayed_refs->lock); +} + +int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, + u64 seq); + +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f44b3928dc2..534266fe505 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" +#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -872,7 +873,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +889,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif @@ -960,6 +962,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; + /* + * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing + * slab allocation from alloc_extent_state down the callchain where + * it'd hit a BUG_ON as those flags are not allowed. + */ + gfp_flags &= ~GFP_SLAB_BUG_MASK; + ret = try_release_extent_state(map, tree, page, gfp_flags); if (!ret) return 0; @@ -1142,7 +1151,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_item_inserted = 0; root->orphan_cleanup_state = 0; - root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; @@ -1216,6 +1224,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); + if (root) + root->fs_info = fs_info; + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1223,7 +1239,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); @@ -1243,7 +1259,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); + BTRFS_TREE_LOG_OBJECTID, NULL, + 0, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1317,7 +1334,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, u32 blocksize; int ret = 0; - root = kzalloc(sizeof(*root), GFP_NOFS); + root = btrfs_alloc_root(fs_info); if (!root) return ERR_PTR(-ENOMEM); if (location->offset == (u64)-1) { @@ -1579,9 +1596,7 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(root->fs_info); } - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1635,9 +1650,7 @@ sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - if (freezing(current)) { - refrigerator(); - } else { + if (!try_to_freeze()) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && !btrfs_transaction_blocked(root->fs_info)) @@ -1877,9 +1890,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) { u32 sectorsize; u32 nodesize; @@ -1891,8 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; - struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *tree_root; struct btrfs_root *extent_root; struct btrfs_root *csum_root; struct btrfs_root *chunk_root; @@ -1903,16 +1916,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; - extent_root = fs_info->extent_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - csum_root = fs_info->csum_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - chunk_root = fs_info->chunk_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - dev_root = fs_info->dev_root = - kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); + extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); + csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); + chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); + dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - if (!extent_root || !csum_root || !chunk_root || !dev_root) { + if (!tree_root || !extent_root || !csum_root || + !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } @@ -2001,6 +2012,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif + + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2238,6 +2260,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_sb_buffer; } + if (sectorsize < PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size " + "found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2270,9 +2298,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); @@ -2281,6 +2307,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices); + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); @@ -2321,9 +2353,6 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { @@ -2356,6 +2385,19 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "btrfs: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2371,7 +2413,7 @@ retry_root_backup: btrfs_level_size(tree_root, btrfs_super_log_root_level(disk_super)); - log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; goto fail_trans_kthread; @@ -2426,13 +2468,17 @@ retry_root_backup: if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + + if (!err) + err = btrfs_recover_balance(fs_info->tree_root); + if (err) { close_ctree(tree_root); - return ERR_PTR(err); + return err; } } - return tree_root; + return 0; fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); @@ -2478,8 +2524,7 @@ fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_close_devices(fs_info->fs_devices); - free_fs_info(fs_info); - return ERR_PTR(err); + return err; recovery_tree_root: if (!btrfs_test_opt(tree_root, RECOVERY)) @@ -2634,7 +2679,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2711,7 +2756,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); return 0; } @@ -2975,6 +3020,9 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(root->fs_info); + btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ @@ -2982,7 +3030,7 @@ int close_ctree(struct btrfs_root *root) (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(root->fs_info); + btrfs_run_defrag_inodes(fs_info); /* * Here come 2 situations when btrfs is broken to flip readonly: @@ -3011,8 +3059,8 @@ int close_ctree(struct btrfs_root *root) btrfs_put_block_group_cache(fs_info); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); + kthread_stop(fs_info->transaction_kthread); + kthread_stop(fs_info->cleaner_kthread); fs_info->closing = 2; smp_mb(); @@ -3030,14 +3078,14 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->extent_root->commit_root); free_extent_buffer(fs_info->tree_root->node); free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - free_extent_buffer(root->fs_info->dev_root->node); - free_extent_buffer(root->fs_info->dev_root->commit_root); - free_extent_buffer(root->fs_info->csum_root->node); - free_extent_buffer(root->fs_info->csum_root->commit_root); + free_extent_buffer(fs_info->chunk_root->node); + free_extent_buffer(fs_info->chunk_root->commit_root); + free_extent_buffer(fs_info->dev_root->node); + free_extent_buffer(fs_info->dev_root->commit_root); + free_extent_buffer(fs_info->csum_root->node); + free_extent_buffer(fs_info->csum_root->commit_root); - btrfs_free_block_groups(root->fs_info); + btrfs_free_block_groups(fs_info); del_fs_roots(fs_info); @@ -3057,14 +3105,17 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - free_fs_info(fs_info); - return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c99d0a8f13f..e4bc4741319 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); -struct btrfs_root *open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); +int open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); int close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1b8dc33778f..5f77166fd01 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation, int check_generation) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; struct btrfs_key key; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5fbe576d2b..37e0a800d34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,23 +34,24 @@ #include "locking.h" #include "free-space-cache.h" -/* control flags for do_chunk_alloc's force field +/* + * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk * if we really need one. * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * * CHUNK_ALLOC_LIMITED means to only try and allocate one * if we have very few chunks already allocated. This is * used as part of the clustering code to help make sure * we have a good pool of storage to cluster in, without * filling the FS with empty chunks * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * */ enum { CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_FORCE = 1, - CHUNK_ALLOC_LIMITED = 2, + CHUNK_ALLOC_LIMITED = 1, + CHUNK_ALLOC_FORCE = 2, }; /* @@ -618,8 +619,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + u64 root_objectid, u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL); + BTRFS_ADD_DELAYED_REF, NULL, for_cow); } return ret; } @@ -2233,6 +2237,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } /* + * locked_ref is the head node, so we have to go one + * node back for any delayed ref updates + */ + ref = select_delayed_ref(locked_ref); + + if (ref && ref->seq && + btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + /* + * there are still refs with lower seq numbers in the + * process of being added. Don't run this ref yet. + */ + list_del_init(&locked_ref->cluster); + mutex_unlock(&locked_ref->mutex); + locked_ref = NULL; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + + /* * record the must insert reserved flag before we * drop the spin lock. */ @@ -2242,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); if (!ref) { /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, @@ -2267,9 +2288,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(extent_op); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; + goto next; } list_del_init(&locked_ref->cluster); @@ -2279,7 +2298,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + /* + * we modified num_entries, but as we're currently running + * delayed refs, skip + * wake_up(&delayed_refs->seq_wait); + * here. + */ spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, @@ -2289,13 +2313,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(ref); kfree(extent_op); count++; - +next: + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); cond_resched(); spin_lock(&delayed_refs->lock); } return count; } + +static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, + unsigned long num_refs) +{ + struct list_head *first_seq = delayed_refs->seq_head.next; + + spin_unlock(&delayed_refs->lock); + pr_debug("waiting for more refs (num %ld, first %p)\n", + num_refs, first_seq); + wait_event(delayed_refs->seq_wait, + num_refs != delayed_refs->num_entries || + delayed_refs->seq_head.next != first_seq); + pr_debug("done waiting for more refs (num %ld, first %p)\n", + delayed_refs->num_entries, delayed_refs->seq_head.next); + spin_lock(&delayed_refs->lock); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2311,15 +2356,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct list_head cluster; int ret; + u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; + unsigned long num_refs = 0; + int consider_waiting; if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; + do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), + CHUNK_ALLOC_NO_FORCE); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: + consider_waiting = 0; spin_lock(&delayed_refs->lock); if (count == 0) { count = delayed_refs->num_entries * 2; @@ -2336,11 +2389,35 @@ again: * of refs to process starting at the first one we are able to * lock */ + delayed_start = delayed_refs->run_delayed_start; ret = btrfs_find_ref_cluster(trans, &cluster, delayed_refs->run_delayed_start); if (ret) break; + if (delayed_start >= delayed_refs->run_delayed_start) { + if (consider_waiting == 0) { + /* + * btrfs_find_ref_cluster looped. let's do one + * more cycle. if we don't run any delayed ref + * during that cycle (because we can't because + * all of them are blocked) and if the number of + * refs doesn't change, we avoid busy waiting. + */ + consider_waiting = 1; + num_refs = delayed_refs->num_entries; + } else { + wait_for_more_refs(delayed_refs, num_refs); + /* + * after waiting, things have changed. we + * dropped the lock and someone else might have + * run some refs, built new clusters and so on. + * therefore, we restart staleness detection. + */ + consider_waiting = 0; + } + } + ret = run_clustered_refs(trans, root, &cluster); BUG_ON(ret < 0); @@ -2348,6 +2425,11 @@ again: if (count == 0) break; + + if (ret || delayed_refs->run_delayed_start == 0) { + /* refs were run, let's reset staleness detection */ + consider_waiting = 0; + } } if (run_all) { @@ -2405,7 +2487,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; - ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, + num_bytes, extent_op); if (ret) kfree(extent_op); return ret; @@ -2590,7 +2673,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc) + int full_backref, int inc, int for_cow) { u64 bytenr; u64 num_bytes; @@ -2603,7 +2686,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64, int); ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); @@ -2640,14 +2723,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset); + key.offset, for_cow); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + parent, ref_root, level - 1, 0, + for_cow); if (ret) goto fail; } @@ -2659,15 +2743,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref) + struct extent_buffer *buf, int full_backref, int for_cow) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2993,9 +3077,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | - BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3016,20 +3098,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); - if (extra_flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; - } + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; } +/* + * @flags: available profiles in extended format (see ctree.h) + * + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. + */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* @@ -3040,6 +3129,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + /* pick restriper's target profile if it's available */ + spin_lock(&root->fs_info->balance_lock); + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && + (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->data.target)) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && + (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->sys.target)) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && + (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->meta.target)) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + spin_unlock(&root->fs_info->balance_lock); + flags = tgt; + goto out; + } + } + spin_unlock(&root->fs_info->balance_lock); + if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3059,22 +3176,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) if ((flags & BTRFS_BLOCK_GROUP_RAID0) && ((flags & BTRFS_BLOCK_GROUP_RAID1) | (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) + (flags & BTRFS_BLOCK_GROUP_DUP))) { flags &= ~BTRFS_BLOCK_GROUP_RAID0; + } + +out: + /* extended -> chunk profile */ + flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits & - root->fs_info->data_alloc_profile; + flags |= root->fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits & - root->fs_info->system_alloc_profile; + flags |= root->fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits & - root->fs_info->metadata_alloc_profile; + flags |= root->fs_info->avail_metadata_alloc_bits; + return btrfs_reduce_alloc_profile(root, flags); } @@ -3191,6 +3311,9 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)(unsigned long)data_sinfo, + bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3210,6 +3333,9 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + (u64)(unsigned long)data_sinfo, + bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3257,27 +3383,15 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_bytes - num_allocated < thresh) return 1; } - - /* - * we have two similar checks here, one based on percentage - * and once based on a hard number of 256MB. The idea - * is that if we have a good amount of free - * room, don't allocate a chunk. A good mount is - * less than 80% utilized of the chunks we have allocated, - * or more than 256MB free - */ - if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) - return 0; - - if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) - return 0; - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - /* 256MB or 5% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + /* 256MB or 2% of the FS */ + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); + /* system chunks need a much small threshold */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) + thresh = 32 * 1024 * 1024; - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; return 1; } @@ -3291,7 +3405,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; - flags = btrfs_reduce_alloc_profile(extent_root, flags); + BUG_ON(!profile_is_valid(flags, 0)); space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { @@ -3303,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, again: spin_lock(&space_info->lock); - if (space_info->force_alloc) + if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { spin_unlock(&space_info->lock); @@ -3499,12 +3613,15 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; + spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size < bytes) { + if (space_info->bytes_pinned + delayed_rsv->size < bytes) { spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); @@ -3582,6 +3699,10 @@ again: if (used <= space_info->total_bytes) { if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3649,6 +3770,10 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3755,7 +3880,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, +static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; @@ -3791,6 +3917,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3947,7 +4076,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root, if (global_rsv->full || global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, + num_bytes); } /* @@ -3980,7 +4110,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div64_u64(meta_used, 3) * 2; return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } @@ -4006,11 +4136,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)(unsigned long)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + (u64)(unsigned long)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4045,7 +4179,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, + (u64)-1); WARN_ON(fs_info->delalloc_block_rsv.size > 0); WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); @@ -4062,6 +4197,9 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)(unsigned long)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -4079,6 +4217,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, * when we are truly done with the orphan item. */ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } @@ -4086,6 +4226,8 @@ void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); + trace_btrfs_space_reservation(root->fs_info, "orphan", + btrfs_ino(inode), num_bytes, 0); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } @@ -4213,12 +4355,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) /* Need to be holding the i_mutex here if we aren't free space cache */ if (btrfs_is_free_space_inode(root, inode)) flush = 0; - else - WARN_ON(!mutex_is_locked(&inode->i_mutex)); if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); + mutex_lock(&BTRFS_I(inode)->delalloc_mutex); num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); @@ -4266,8 +4407,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (to_free) + if (to_free) { btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, + "delalloc", + btrfs_ino(inode), + to_free, 0); + } + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); return ret; } @@ -4278,7 +4425,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + if (to_reserve) + trace_btrfs_space_reservation(root->fs_info,"delalloc", + btrfs_ino(inode), to_reserve, 1); block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; @@ -4308,6 +4459,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -4562,7 +4715,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { - BUG_ON(space_info->bytes_may_use < num_bytes); + trace_btrfs_space_reservation(cache->fs_info, + "space_info", + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } @@ -4928,6 +5084,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; + if (waitqueue_active(&delayed_refs->seq_wait)) + wake_up(&delayed_refs->seq_wait); /* * we don't take a ref on the node because we're removing it from the @@ -4955,16 +5113,17 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref) + u64 parent, int last_ref, int for_cow) { struct btrfs_block_group_cache *cache = NULL; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } @@ -4999,12 +5158,12 @@ out: btrfs_put_block_group(cache); } -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int for_cow) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; /* * tree log blocks never actually go into the extent allocation @@ -5016,14 +5175,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, + num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL); + BTRFS_DROP_DELAYED_REF, NULL, for_cow); BUG_ON(ret); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, + num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, + NULL, for_cow); BUG_ON(ret); } return ret; @@ -5146,6 +5308,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; + trace_find_free_extent(orig_root, num_bytes, empty_size, data); + space_info = __find_space_info(root->fs_info, data); if (!space_info) { printk(KERN_ERR "No space info for %llu\n", data); @@ -5295,15 +5459,6 @@ alloc: if (unlikely(block_group->ro)) goto loop; - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - /* * Ok we want to try and use the cluster allocator, so * lets look there @@ -5331,6 +5486,8 @@ alloc: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, num_bytes); goto checks; } @@ -5349,8 +5506,15 @@ refill_cluster: * plenty of times and not have found * anything, so we are likely way too * fragmented for the clustering stuff to find - * anything. */ - if (loop >= LOOP_NO_EMPTY_SIZE) { + * anything. + * + * However, if the cluster is taken from the + * current block group, release the cluster + * first, so that we stand a better chance of + * succeeding in the unclustered + * allocation. */ + if (loop >= LOOP_NO_EMPTY_SIZE && + last_ptr->block_group != block_group) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; } @@ -5361,6 +5525,11 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5377,6 +5546,9 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(root, + block_group, search_start, + num_bytes); goto checks; } } else if (!cached && loop > LOOP_CACHING_NOWAIT @@ -5401,6 +5573,15 @@ refill_cluster: } unclustered_alloc: + spin_lock(&block_group->free_space_ctl->tree_lock); + if (cached && + block_group->free_space_ctl->free_space < + num_bytes + empty_cluster + empty_size) { + spin_unlock(&block_group->free_space_ctl->tree_lock); + goto loop; + } + spin_unlock(&block_group->free_space_ctl->tree_lock); + offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5438,9 +5619,6 @@ checks: goto loop; } - ins->objectid = search_start; - ins->offset = num_bytes; - if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5457,6 +5635,8 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; + trace_btrfs_reserve_extent(orig_root, block_group, + search_start, num_bytes); if (offset < search_start) btrfs_add_free_space(used_block_group, offset, search_start - offset); @@ -5621,6 +5801,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { + bool final_tried = false; int ret; u64 search_start = 0; @@ -5640,22 +5821,25 @@ again: search_start, search_end, hint_byte, ins, data); - if (ret == -ENOSPC && num_bytes > min_alloc_size) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); - num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - goto again; - } - if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { - struct btrfs_space_info *sinfo; - - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (ret == -ENOSPC) { + if (!final_tried) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, CHUNK_ALLOC_FORCE); + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } } trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); @@ -5842,9 +6026,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, - 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL); + ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ins->offset, 0, + root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL, 0); return ret; } @@ -5997,10 +6182,11 @@ use_block_rsv(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOSPC); } -static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +static void unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); } /* @@ -6014,7 +6200,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size) + u64 hint, u64 empty_size, int for_cow) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6030,7 +6216,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, empty_size, hint, (u64)-1, &ins, 0); if (ret) { - unuse_block_rsv(block_rsv, blocksize); + unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -6058,10 +6244,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->is_data = 0; - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); + extent_op, for_cow); BUG_ON(ret); } return buf; @@ -6078,6 +6265,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int for_reloc; }; #define DROP_REFERENCE 1 @@ -6216,9 +6404,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1); + ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); BUG_ON(ret); - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, 0); @@ -6362,7 +6550,7 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); + root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); } btrfs_tree_unlock(next); @@ -6436,9 +6624,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1); + ret = btrfs_dec_ref(trans, root, eb, 1, + wc->for_reloc); else - ret = btrfs_dec_ref(trans, root, eb, 0); + ret = btrfs_dec_ref(trans, root, eb, 0, + wc->for_reloc); BUG_ON(ret); } /* make block locked assertion in clean_tree_block happy */ @@ -6465,7 +6655,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); out: wc->refs[level] = 0; wc->flags[level] = 0; @@ -6549,7 +6739,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * blocks are properly updated. */ void btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) + struct btrfs_block_rsv *block_rsv, int update_ref, + int for_reloc) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6637,6 +6828,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root, wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->for_reloc = for_reloc; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6721,6 +6913,7 @@ out: * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' + * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -6765,6 +6958,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->for_reloc = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { @@ -6792,6 +6986,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + /* pick restriper's target profile and return */ + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + /* extended -> chunk profile */ + tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + return tgt; + } + } + /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS @@ -7085,7 +7302,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7447,6 +7664,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -7466,6 +7684,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return 0; } +static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + /* chunk -> extended profile */ + if (extra_flags == 0) + extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits &= ~extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits &= ~extra_flags; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start) { @@ -7476,6 +7710,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_key key; struct inode *inode; int ret; + int index; int factor; root = root->fs_info->extent_root; @@ -7491,6 +7726,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, block_group); memcpy(&key, &block_group->key, sizeof(key)); + index = get_block_group_index(block_group); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -7565,6 +7801,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); + if (list_empty(&block_group->space_info->block_groups[index])) + clear_avail_alloc_bits(root->fs_info, block_group->flags); up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) @@ -7654,9 +7892,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - cache = btrfs_lookup_block_group(fs_info, range->start); + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f..a55fbe6252d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" +#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -512,6 +513,15 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + if (state->end < end && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) + goto next; + /* * | ---- desired range ---- | * | state | or @@ -564,20 +574,15 @@ hit_next: goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - set |= clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; if (start <= end && next_node) { state = rb_entry(next_node, struct extent_state, rb_node); - if (state->start == start) - goto hit_next; + goto hit_next; } goto search_again; @@ -960,8 +965,6 @@ hit_next: set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -1006,7 +1009,6 @@ hit_next: if (state->end <= end) { set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1067,8 +1069,6 @@ hit_next: set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } @@ -1895,7 +1895,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2153,13 +2153,46 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(NULL, page, + start, end, NULL); + /* Writeback already completed */ + if (ret == 0) + return 1; + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2171,13 +2204,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; - int ret; do { struct page *page = bvec->bv_page; @@ -2194,28 +2225,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; if (whole_page) end_page_writeback(page); @@ -2393,7 +2405,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; @@ -2778,9 +2790,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + &nr_written); + BUG_ON(ret); /* * delalloc_end is already one less than the total * length, so we don't subtract one from @@ -2817,8 +2832,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; @@ -3288,7 +3307,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } @@ -3579,6 +3598,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->blocking_writers, 0); atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->spinning_writers, 0); + eb->lock_nested = 0; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -3851,10 +3871,9 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3907,6 +3926,8 @@ int extent_range_uptodate(struct extent_io_tree *tree, while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); + if (!page) + return 1; uptodate = PageUptodate(page); page_cache_release(page); if (!uptodate) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7604c300132..cecc3518c12 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -129,6 +129,7 @@ struct extent_buffer { struct list_head leak_list; struct rcu_head rcu_head; atomic_t refs; + pid_t lock_owner; /* count of read lock holders on the extent buffer */ atomic_t write_locks; @@ -137,6 +138,7 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; + int lock_nested; /* protects write locks */ rwlock_t lock; @@ -317,4 +319,5 @@ struct btrfs_mapping_tree; int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); #endif diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 33a7890b1f4..1195f09761f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,8 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree:1; - unsigned int compress_type:4; + unsigned int in_tree; + unsigned int compress_type; }; struct extent_map_tree { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c05..e8d06b6b919 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -678,7 +678,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset); + start - extent_offset, 0); BUG_ON(ret); *hint_byte = disk_bytenr; } @@ -753,7 +753,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset); + extent_offset, 0); BUG_ON(ret); inode_sub_bytes(inode, extent_end - key.offset); @@ -962,7 +962,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); if (split == start) { @@ -989,7 +989,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } other_start = 0; @@ -1006,7 +1006,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset); + ino, orig_offset, 0); BUG_ON(ret); } if (del_nr == 0) { @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1136,7 +1136,8 @@ again: GFP_NOFS); } for (i = 0; i < num_pages; i++) { - clear_page_dirty_for_io(pages[i]); + if (clear_page_dirty_for_io(pages[i])) + account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } @@ -1273,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, dirty_pages); if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); pos += copied; num_written += copied; @@ -1605,6 +1605,14 @@ static long btrfs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, len); + if (ret) + return ret; + + /* * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. */ @@ -1667,27 +1675,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - - /* - * Make sure we have enough space before we do the - * allocation. - */ - ret = btrfs_check_data_free_space(inode, last_byte - - cur_offset); - if (ret) { - free_extent_map(em); - break; - } - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); - /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, last_byte - - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1715,6 +1708,8 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, len); return ret; } @@ -1761,7 +1756,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) start - root->sectorsize, root->sectorsize, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); goto out; } last_end = em->start + em->len; @@ -1773,7 +1768,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) while (1) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec23d43d0c3..710ea380c7e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + if (io_ctl->pages[i]) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } } } @@ -423,7 +425,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) } if (index == 0) - offset = sizeof(u32) * io_ctl->num_pages;; + offset = sizeof(u32) * io_ctl->num_pages; crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); @@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return ret; + ret = readahead_cache(inode); if (ret) goto out; @@ -772,6 +777,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); + btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); @@ -838,7 +844,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; - u64 start, end, len; + u64 start, extent_start, extent_end, len; int entries = 0; int bitmaps = 0; int ret; @@ -849,7 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (!i_size_read(inode)) return -1; - io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root); + if (ret) + return -1; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -857,25 +865,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_cluster, block_group_list); - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - /* Lock all pages first so we can lock the extent safely. */ io_ctl_prepare_pages(&io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); - /* - * When searching for pinned extents, we need to start at our start - * offset. - */ - if (block_group) - start = block_group->key.objectid; - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); @@ -918,9 +913,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * We want to add any pinned extents to our free space cache * so we don't leak the space */ + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; + + if (block_group) + start = block_group->key.objectid; + while (block_group && (start < block_group->key.objectid + block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, + ret = find_first_extent_bit(unpin, start, + &extent_start, &extent_end, EXTENT_DIRTY); if (ret) { ret = 0; @@ -928,20 +934,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, } /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + + if (extent_start >= block_group->key.objectid + block_group->key.offset) break; - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); + extent_start = max(extent_start, start); + extent_end = min(block_group->key.objectid + + block_group->key.offset, extent_end + 1); + len = extent_end - extent_start; entries++; - ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); if (ret) goto out_nospc; - start = end + 1; + start = extent_end; } /* Write out the bitmaps */ @@ -2236,7 +2243,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, - min_start); + cluster->window_start); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) @@ -2245,6 +2252,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } + cluster->window_start += bytes; } else { ret = entry->offset; @@ -2283,23 +2291,23 @@ out: static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 min_bytes) + u64 offset, u64 bytes, + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; - unsigned long search_bits; - unsigned long total_bits; + unsigned long want_bits; + unsigned long min_bits; unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; int ret; - bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(bytes, block_group->sectorsize); - total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + want_bits = bytes_to_bits(bytes, block_group->sectorsize); + min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -2308,7 +2316,7 @@ again: i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); - if (next_zero - i >= search_bits) { + if (next_zero - i >= min_bits) { found_bits = next_zero - i; break; } @@ -2318,10 +2326,9 @@ again: if (!found_bits) return -ENOSPC; - if (!found) { + if (!total_found) { start = i; cluster->max_size = 0; - found = true; } total_found += found_bits; @@ -2329,13 +2336,8 @@ again: if (cluster->max_size < found_bits * block_group->sectorsize) cluster->max_size = found_bits * block_group->sectorsize; - if (total_found < total_bits) { - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); - if (i - start > total_bits * 2) { - total_found = 0; - cluster->max_size = 0; - found = false; - } + if (total_found < want_bits || cluster->max_size < cont1_bytes) { + i = next_zero + 1; goto again; } @@ -2346,28 +2348,31 @@ again: &entry->offset_index, 1); BUG_ON(ret); + trace_btrfs_setup_cluster(block_group, cluster, + total_found * block_group->sectorsize, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. + * Try to find a cluster with at least bytes total bytes, at least one + * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *prev = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_start; u64 window_free; u64 max_extent; - u64 max_gap = 128 * 1024; + u64 total_size = 0; entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) @@ -2377,8 +2382,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * We don't want bitmaps, so just move along until we find a normal * extent entry. */ - while (entry->bitmap) { - if (list_empty(&entry->list)) + while (entry->bitmap || entry->bytes < min_bytes) { + if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) @@ -2391,12 +2396,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, max_extent = entry->bytes; first = entry; last = entry; - prev = entry; - while (window_free <= min_bytes) { - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; + for (node = rb_next(&entry->offset_index); node; + node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { @@ -2405,26 +2407,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, continue; } - /* - * we haven't filled the empty size and the window is - * very large. reset and try again - */ - if (entry->offset - (prev->offset + prev->bytes) > max_gap || - entry->offset - window_start > (min_bytes * 2)) { - first = entry; - window_start = entry->offset; - window_free = entry->bytes; - last = entry; + if (entry->bytes < min_bytes) + continue; + + last = entry; + window_free += entry->bytes; + if (entry->bytes > max_extent) max_extent = entry->bytes; - } else { - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - prev = entry; } + if (window_free < bytes || max_extent < cont1_bytes) + return -ENOSPC; + cluster->window_start = first->offset; node = &first->offset_index; @@ -2438,17 +2432,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap) + if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); + total_size += entry->bytes; BUG_ON(ret); } while (node && entry != last); cluster->max_size = max_extent; - + trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } @@ -2460,7 +2455,7 @@ static noinline int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, - u64 min_bytes) + u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; @@ -2482,10 +2477,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } list_for_each_entry(entry, bitmaps, list) { - if (entry->bytes < min_bytes) + if (entry->bytes < bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); + bytes, cont1_bytes, min_bytes); if (!ret) return 0; } @@ -2499,7 +2494,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, /* * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. + * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise @@ -2515,23 +2510,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; + u64 cont1_bytes; int ret; - /* for metadata, allow allocates with more holes */ + /* + * Choose the minimum extent size we'll require for this + * cluster. For SSD_SPREAD, don't allow any fragmentation. + * For metadata, allow allocates with smaller extents. For + * data, keep it dense. + */ if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; + cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); + cont1_bytes = bytes; + min_bytes = block_group->sectorsize; + } else { + cont1_bytes = max(bytes, (bytes + empty_size) >> 2); + min_bytes = block_group->sectorsize; + } spin_lock(&ctl->tree_lock); @@ -2539,7 +2535,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ - if (ctl->free_space < min_bytes) { + if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } @@ -2552,11 +2548,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } + trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, + min_bytes); + + INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes, min_bytes); + bytes + empty_size, + cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes, min_bytes); + offset, bytes + empty_size, + cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) @@ -2567,6 +2569,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; + } else { + trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); @@ -2588,17 +2592,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +static int do_trimming(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 bytes, + u64 reserved_start, u64 reserved_bytes) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; + struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 bytes = 0; - u64 actually_trimmed; - int ret = 0; + int ret; + int update = 0; + u64 trimmed = 0; - *trimmed = 0; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += reserved_bytes; + space_info->bytes_reserved += reserved_bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); + if (!ret) + *total_trimmed += trimmed; + + btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += reserved_bytes; + block_group->reserved -= reserved_bytes; + space_info->bytes_reserved -= reserved_bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } + + return ret; +} + +static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = 0; + u64 extent_start; + u64 extent_bytes; + u64 bytes; while (start < end) { spin_lock(&ctl->tree_lock); @@ -2609,81 +2653,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, } entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) - entry = tree_search_offset(ctl, - offset_to_bitmap(ctl, start), - 1, 1); - - if (!entry || entry->offset >= end) { + if (!entry) { spin_unlock(&ctl->tree_lock); break; } - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, &start, &bytes); - if (!ret) { - if (start >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - bytes = min(bytes, end - start); - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } else { - start = entry->offset + BITS_PER_BITMAP * - block_group->sectorsize; + /* skip bitmaps */ + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) { spin_unlock(&ctl->tree_lock); - ret = 0; - continue; + goto out; } - } else { - start = entry->offset; - bytes = min(entry->bytes, end - start); - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + } + + if (entry->offset >= end) { + spin_unlock(&ctl->tree_lock); + break; } + extent_start = entry->offset; + extent_bytes = entry->bytes; + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + spin_unlock(&ctl->tree_lock); - if (bytes >= minlen) { - struct btrfs_space_info *space_info; - int update = 0; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += bytes; - space_info->bytes_reserved += bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, - bytes, - &actually_trimmed); - - btrfs_add_free_space(block_group, start, bytes); - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += bytes; - block_group->reserved -= bytes; - space_info->bytes_reserved -= bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } + ret = do_trimming(block_group, total_trimmed, start, bytes, + extent_start, extent_bytes); + if (ret) + break; +next: + start += bytes; - if (ret) - break; - *trimmed += actually_trimmed; + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } +out: + return ret; +} + +static int trim_bitmaps(struct btrfs_block_group_cache *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; + int ret = 0; + int ret2; + u64 bytes; + u64 offset = offset_to_bitmap(ctl, start); + + while (offset < end) { + bool next_bitmap = false; + + spin_lock(&ctl->tree_lock); + + if (ctl->free_space < minlen) { + spin_unlock(&ctl->tree_lock); + break; + } + + entry = tree_search_offset(ctl, offset, 1, 0); + if (!entry) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = minlen; + ret2 = search_bitmap(ctl, entry, &start, &bytes); + if (ret2 || start >= end) { + spin_unlock(&ctl->tree_lock); + next_bitmap = true; + goto next; + } + + bytes = min(bytes, end - start); + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + goto next; + } + + bitmap_clear_bits(ctl, entry, start, bytes); + if (entry->bytes == 0) + free_bitmap(ctl, entry); + + spin_unlock(&ctl->tree_lock); + + ret = do_trimming(block_group, total_trimmed, start, bytes, + start, bytes); + if (ret) + break; +next: + if (next_bitmap) { + offset += BITS_PER_BITMAP * ctl->unit; + } else { + start += bytes; + if (start >= offset + BITS_PER_BITMAP * ctl->unit) + offset += BITS_PER_BITMAP * ctl->unit; } - start += bytes; - bytes = 0; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -2696,6 +2777,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, return ret; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + return ret; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index f8962a957d6..ee15d88b33d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,6 +438,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, + trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -498,6 +501,9 @@ again: out_put: iput(inode); out_release: + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, + trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: trans->block_rsv = rsv; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fd1a06df5bc..892b34785cc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1555,6 +1555,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; + int ret; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; @@ -1582,12 +1583,21 @@ again: page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); goto again; } - BUG(); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); + set_page_dirty(page); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -1630,7 +1640,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) fixup->work.func = btrfs_writepage_fixup_worker; fixup->page = page; btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EAGAIN; + return -EBUSY; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -1944,19 +1954,35 @@ enum btrfs_orphan_cleanup_state { }; /* - * This is called in transaction commmit time. If there are no orphan + * This is called in transaction commit time. If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv * structure. */ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; int ret; if (!list_empty(&root->orphan_list) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; + spin_lock(&root->orphan_lock); + if (!list_empty(&root->orphan_list)) { + spin_unlock(&root->orphan_lock); + return; + } + + if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { + spin_unlock(&root->orphan_lock); + return; + } + + block_rsv = root->orphan_block_rsv; + root->orphan_block_rsv = NULL; + spin_unlock(&root->orphan_lock); + if (root->orphan_item_inserted && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, @@ -1965,10 +1991,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_item_inserted = 0; } - if (root->orphan_block_rsv) { - WARN_ON(root->orphan_block_rsv->size > 0); - btrfs_free_block_rsv(root, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; + if (block_rsv) { + WARN_ON(block_rsv->size > 0); + btrfs_free_block_rsv(root, block_rsv); } } @@ -2224,14 +2249,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; - /* - * Need to hold the imutex for reservation purposes, not - * a huge deal here but I have a WARN_ON in - * btrfs_delalloc_reserve_space to catch offenders. - */ - mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); - mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2845,7 +2863,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3009,7 +3027,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; - int encoding; int ret; int err = 0; u64 ino = btrfs_ino(inode); @@ -3059,7 +3076,6 @@ search_again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - encoding = 0; if (found_key.objectid != ino) break; @@ -3072,10 +3088,6 @@ search_again: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - encoding = btrfs_file_extent_compression(leaf, fi); - encoding |= btrfs_file_extent_encryption(leaf, fi); - encoding |= btrfs_file_extent_other_encoding(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); @@ -3103,7 +3115,7 @@ search_again: if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item && !encoding) { + if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = new_size - @@ -3179,7 +3191,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, 0); BUG_ON(ret); } @@ -3434,7 +3446,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); } else { /* @@ -4412,8 +4424,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, int mode, - u64 *index) + u64 ref_objectid, u64 objectid, + umode_t mode, u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4573,7 +4585,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - BUG_ON(ret); + if (ret) + goto fail_dir_item; btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); @@ -4581,6 +4594,23 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, parent_inode); } return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); + } + return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, @@ -4596,7 +4626,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, } static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4655,7 +4685,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); @@ -4665,7 +4695,7 @@ out_unlock: } static int btrfs_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -4723,7 +4753,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } out_unlock: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4782,7 +4812,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -4792,7 +4822,7 @@ fail: return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -4848,7 +4878,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -5121,7 +5151,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { - WARN_ON(1); + BUG(); if (!trans) { kunmap(page); free_extent_map(em); @@ -6399,21 +6429,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long zero_start; loff_t size; int ret; + int reserved = 0; u64 page_start; u64 page_end; - /* Need this to keep space reservations serialized */ - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); - if (!ret) + if (!ret) { ret = btrfs_update_time(vma->vm_file); + reserved = 1; + } if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - goto out; + if (reserved) + goto out; + goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -6494,8 +6526,9 @@ out_unlock: if (!ret) return VM_FAULT_LOCKED; unlock_page(page); - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve: return ret; } @@ -6668,7 +6701,7 @@ end_trans: err = ret; nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); + ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); } @@ -6691,8 +6724,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int err; u64 index = 0; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, S_IFDIR | 0700, &index); + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; @@ -6749,6 +6784,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(&ei->io_tree, &inode->i_data); extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); mutex_init(&ei->log_mutex); + mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -6761,7 +6797,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) static void btrfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -7075,7 +7110,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); } out_fail: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -7247,7 +7282,7 @@ out_unlock: if (!err) d_instantiate(dentry, inode); nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5b..d8b54715c2d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_trans_handle *trans; unsigned int flags, oldflags; int ret; + u64 ip_oldflags; + unsigned int i_oldflags; if (btrfs_root_readonly(root)) return -EROFS; @@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) mutex_lock(&inode->i_mutex); + ip_oldflags = ip->flags; + i_oldflags = inode->i_flags; + flags = btrfs_mask_flags(inode->i_mode, flags); oldflags = btrfs_flags_to_ioctl(ip->flags); if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -201,7 +206,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out_unlock; @@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop; + } btrfs_update_iflags(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); btrfs_end_transaction(trans, root); + out_drop: + if (ret) { + ip->flags = ip_oldflags; + inode->i_flags = i_oldflags; + } - mnt_drop_write(file->f_path.mnt); - - ret = 0; + mnt_drop_write_file(file); out_unlock: mutex_unlock(&inode->i_mutex); return ret; @@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(root, &range); + ret = btrfs_trim_fs(fs_info->tree_root, &range); if (ret < 0) return ret; @@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -852,30 +861,45 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); - mutex_unlock(&inode->i_mutex); if (ret) return ret; -again: - ret = 0; i_done = 0; + tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; +again: page = find_or_create_page(inode->i_mapping, - start_index + i, mask); + start_index + i, mask); if (!page) break; + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent(tree, page_start, page_end, GFP_NOFS); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -886,15 +910,22 @@ again: break; } } + isize = i_size_read(inode); file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end || - page->mapping != inode->i_mapping) { + if (!isize || page->index > file_end) { /* whoops, we blew past eof, skip this page */ unlock_page(page); page_cache_release(page); break; } + + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + pages[i] = page; i_done++; } @@ -917,25 +948,6 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); - if (ordered && - ordered->file_offset + ordered->len > page_start && - ordered->file_offset < page_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, - &cached_state, GFP_NOFS); - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_wait_ordered_range(inode, page_start, - page_end - page_start); - goto again; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, @@ -1058,7 +1070,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index; + max_to_defrag = last_index + 1; /* * make writeback starts from i, so the defrag range can be @@ -1203,13 +1215,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1226,7 +1246,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_unlock; + goto out_free; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1241,7 +1261,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } @@ -1250,7 +1270,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1259,11 +1279,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); @@ -1276,7 +1296,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_unlock; + goto out_free; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1284,9 +1304,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -1311,6 +1332,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, goto out; } + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out; + } + if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly); @@ -1855,7 +1882,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; @@ -1971,7 +1998,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: kfree(vol_args); return err; @@ -1987,7 +2014,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2040,7 +2067,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2052,14 +2079,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2074,14 +2112,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2195,7 +2244,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) return ret; @@ -2427,7 +2476,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao); + new_key.offset - datao, + 0); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -2510,7 +2560,7 @@ out_unlock: out_fput: fput(src_file); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return ret; } @@ -2549,7 +2599,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (btrfs_root_readonly(root)) goto out; - ret = mnt_want_write(file->f_path.mnt); + ret = mnt_want_write_file(file); if (ret) goto out; @@ -2565,7 +2615,7 @@ static long btrfs_ioctl_trans_start(struct file *file) out_drop: atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: return ret; } @@ -2800,7 +2850,7 @@ long btrfs_ioctl_trans_end(struct file *file) atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return 0; } @@ -2977,7 +3027,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, { int ret = 0; int size; - u64 extent_offset; + u64 extent_item_pos; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; @@ -3008,15 +3058,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, } ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = -ENOENT; if (ret < 0) goto out; - extent_offset = loi->logical - key.objectid; + extent_item_pos = loi->logical - key.objectid; ret = iterate_extent_inodes(root->fs_info, path, key.objectid, - extent_offset, build_ino_list, inodes); + extent_item_pos, build_ino_list, + inodes); if (ret < 0) goto out; @@ -3034,6 +3086,163 @@ out: return ret; } +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } +} + +static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out; + } + + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out_bargs; + } + + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + + goto do_balance; + } + } else { + bargs = NULL; + } + + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + } + +do_balance: + ret = btrfs_balance(bctl, bargs); + /* + * bctl is freed in __cancel_balance or in free_fs_info if + * restriper was paused all the way until unmount + */ + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); + } + + return -EINVAL; +} + +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3078,7 +3287,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3110,6 +3319,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(root, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de..4f69028a68c 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 + +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + __u64 usage; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 target; + + __u64 flags; + + __u64 unused[8]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d77b67c4b27..5e178d8f716 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (&eb->lock_nested && current->pid == eb->lock_owner) { + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. We allow + * an additional read lock to be added because it's for the same + * thread. btrfs_find_all_roots() depends on this as it may be + * called on a partly (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = 1; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); @@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } atomic_inc(&eb->write_locks); atomic_inc(&eb->spinning_writers); + eb->lock_owner = current->pid; return 1; } @@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); @@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + if (eb->lock_nested) { + read_lock(&eb->lock); + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + read_unlock(&eb->lock); + return; + } + read_unlock(&eb->lock); + } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); if (atomic_dec_and_test(&eb->blocking_readers)) @@ -181,6 +229,7 @@ again: WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); atomic_inc(&eb->write_locks); + eb->lock_owner = current->pid; return 0; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 2373b39a132..22db04550f6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -305,7 +305,7 @@ again: spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); spin_unlock(&fs_info->reada_lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfb55434a46..8c1aae2c845 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, 1); BUG_ON(ret); } if (dirty) @@ -1778,21 +1778,23 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + src->root_key.objectid, level - 1, 0, + 1); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0); + 0, 1); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2244,7 +2246,7 @@ again: } else { list_del_init(&reloc_root->root_list); } - btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); } if (found) { @@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0); + node->level, 0, 1); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { - mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); - mutex_unlock(&inode->i_mutex); if (ret) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc..abc0fbffa51 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, u8 ref_level; unsigned long ptr = 0; const int bufsize = 4096; - u64 extent_offset; + u64 extent_item_pos; path = btrfs_alloc_path(); @@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, if (ret < 0) goto out; - extent_offset = swarn.logical - found_key.objectid; + extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); + btrfs_release_path(path); if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, } else { swarn.path = path; iterate_extent_inodes(fs_info, path, found_key.objectid, - extent_offset, + extent_item_pos, scrub_print_warning_inode, &swarn); } @@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, sbio->bio); + btrfsic_submit_bio(READ, sbio->bio); return 0; } @@ -1365,7 +1367,8 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = &sdev->dev->dev_root->fs_info->mapping_tree; @@ -1389,7 +1392,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev) { + if (map->stripes[i].dev == sdev->dev && + map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sdev, map, i, chunk_offset, length); if (ret) goto out; @@ -1485,7 +1489,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, - chunk_offset, length); + chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) break; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 200f63bc667..3ce97b217cb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,7 +40,6 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/cleancache.h> -#include <linux/mnt_namespace.h> #include <linux/ratelimit.h> #include "compat.h" #include "delayed-inode.h" @@ -148,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - int ret; - - ret = close_ctree(root); - sb->s_fs_info = NULL; - - (void)ret; /* FIXME: need to fix VFS to return error? */ + (void)close_ctree(btrfs_sb(sb)->tree_root); + /* FIXME: need to fix VFS to return error? */ + /* AV: return it _where_? ->put_super() can be triggered by any number + * of async events, up to and including delivery of SIGKILL to the + * last process that kept it busy. Or segfault in the aforementioned + * process... Whom would you report that to? + */ } enum { @@ -164,8 +163,11 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +202,10 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -398,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + printk(KERN_INFO "btrfs: enabling check integrity" + " including extent data\n"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->check_integrity_print_mask = intarg; + printk(KERN_INFO "btrfs:" + " check_integrity_print_mask 0x%x\n", + info->check_integrity_print_mask); + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + printk(KERN_ERR "btrfs: support for check_integrity*" + " not compiled in!\n"); + ret = -EINVAL; + goto out; +#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -501,7 +541,8 @@ out: static struct dentry *get_default_root(struct super_block *sb, u64 subvol_objectid) { - struct btrfs_root *root = sb->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_path *path; @@ -531,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -545,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb, */ btrfs_free_path(path); dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = root->fs_info->fs_root; + new_root = fs_info->fs_root; goto setup_root; } @@ -553,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb, btrfs_free_path(path); find_root: - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) return ERR_CAST(new_root); @@ -589,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_root *tree_root; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -604,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - tree_root = open_ctree(sb, fs_devices, (char *)data); - - if (IS_ERR(tree_root)) { + err = open_ctree(sb, fs_devices, (char *)data); + if (err) { printk("btrfs: open_ctree failed\n"); - return PTR_ERR(tree_root); + return err; } - sb->s_fs_info = tree_root; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -632,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb, save_mount_options(sb, data); cleancache_init_fs(sb); + sb->s_flags |= MS_ACTIVE; return 0; fail_close: - close_ctree(tree_root); + close_ctree(fs_info->tree_root); return err; } int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; trace_btrfs_sync_fs(wait); if (!wait) { - filemap_flush(root->fs_info->btree_inode->i_mapping); + filemap_flush(fs_info->btree_inode->i_mapping); return 0; } @@ -662,10 +703,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); - struct btrfs_fs_info *info = root->fs_info; + struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); + struct btrfs_root *root = info->tree_root; char *compress_type; if (btrfs_test_opt(root, DEGRADED)) @@ -723,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); return 0; } static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_root *test_root = data; - struct btrfs_root *root = btrfs_sb(s); + struct btrfs_fs_info *p = data; + struct btrfs_fs_info *fs_info = btrfs_sb(s); - /* - * If this super block is going away, return false as it - * can't match as an existing super block. - */ - if (!atomic_read(&s->s_active)) - return 0; - return root->fs_info->fs_devices == test_root->fs_info->fs_devices; + return fs_info->fs_devices == p->fs_devices; } static int btrfs_set_super(struct super_block *s, void *data) { - s->s_fs_info = data; - - return set_anon_super(s, data); + int err = set_anon_super(s, data); + if (!err) + s->s_fs_info = data; + return err; } /* @@ -904,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (!fs_info) return ERR_PTR(-ENOMEM); - fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info->tree_root) { - error = -ENOMEM; - goto error_fs_info; - } - fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); @@ -929,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, - fs_info->tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; } if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { - deactivate_locked_super(s); - error = -EBUSY; - goto error_close_devices; - } - btrfs_close_devices(fs_devices); free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->fs_info->bdev_holder = fs_type; + btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { + root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); + if (IS_ERR(root)) deactivate_locked_super(s); - return root; - } return root; @@ -978,7 +997,8 @@ error_fs_info: static int btrfs_remount(struct super_block *sb, int *flags, char *data) { - struct btrfs_root *root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; int ret; ret = btrfs_parse_options(root, data); @@ -994,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = btrfs_commit_super(root); WARN_ON(ret); } else { - if (root->fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_fs_roots(root->fs_info); + ret = btrfs_cleanup_fs_roots(fs_info); WARN_ON(ret); /* recover relocation */ @@ -1169,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = root->fs_info->super_copy; - struct list_head *head = &root->fs_info->space_info; + struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = fs_info->super_copy; + struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)root->fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fsid; int ret; /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1199,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(root, &total_free_data); + ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } buf->f_bavail += total_free_data; buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -1220,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static void btrfs_kill_super(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + kill_anon_super(sb); + free_fs_info(fs_info); +} + static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", .mount = btrfs_mount, - .kill_sb = kill_anon_super, + .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1258,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->cleaner_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_lock(&fs_info->transaction_kthread_mutex); + mutex_lock(&fs_info->cleaner_mutex); return 0; } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb); - mutex_unlock(&root->fs_info->cleaner_mutex); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + mutex_unlock(&fs_info->cleaner_mutex); + mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d94cd3..04b77e3ceb7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); + WARN_ON(transaction->delayed_refs.root.rb_node); + WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -108,8 +110,11 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.seq = 1; + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &root->fs_info->trans_list); @@ -321,6 +326,9 @@ again: } if (num_bytes) { + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)(unsigned long)h, + num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } @@ -467,19 +475,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - while (count < 4) { + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && trans->transaction->delayed_refs.num_heads_ready > 64) { trans->delayed_ref_updates = 0; - - /* - * do a full flush if the transaction is trying - * to close - */ - if (trans->transaction->delayed_refs.flushing) - cur = 0; btrfs_run_delayed_refs(trans, root, cur); } else { break; @@ -915,7 +916,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - BUG_ON(ret); + if (ret) { + pending->error = -EEXIST; + dput(parent); + goto fail; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); @@ -993,12 +998,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - int ret; - list_for_each_entry(pending, head, list) { - ret = create_pending_snapshot(trans, fs_info, pending); - BUG_ON(ret); - } + list_for_each_entry(pending, head, list) + create_pending_snapshot(trans, fs_info, pending); return 0; } @@ -1393,9 +1395,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, NULL, 0); + btrfs_drop_snapshot(root, NULL, 0, 0); else - btrfs_drop_snapshot(root, NULL, 1); + btrfs_drop_snapshot(root, NULL, 1, 0); } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3568374d419..966cc74f5d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0); BUG_ON(ret); } else { /* @@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); return 0; } @@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { + while (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c new file mode 100644 index 00000000000..12f5147bd2b --- /dev/null +++ b/fs/btrfs/ulist.c @@ -0,0 +1,220 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include "ulist.h" + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + * + * A sample usage for ulists is the enumeration of directed graphs without + * visiting a node twice. The pseudo-code could look like this: + * + * ulist = ulist_alloc(); + * ulist_add(ulist, root); + * elem = NULL; + * + * while ((elem = ulist_next(ulist, elem)) { + * for (all child nodes n in elem) + * ulist_add(ulist, n); + * do something useful with the node; + * } + * ulist_free(ulist); + * + * This assumes the graph nodes are adressable by u64. This stems from the + * usage for tree enumeration in btrfs, where the logical addresses are + * 64 bit. + * + * It is also useful for tree enumeration which could be done elegantly + * recursively, but is not possible due to kernel stack limitations. The + * loop would be similar to the above. + */ + +/** + * ulist_init - freshly initialize a ulist + * @ulist: the ulist to initialize + * + * Note: don't use this function to init an already used ulist, use + * ulist_reinit instead. + */ +void ulist_init(struct ulist *ulist) +{ + ulist->nnodes = 0; + ulist->nodes = ulist->int_nodes; + ulist->nodes_alloced = ULIST_SIZE; +} +EXPORT_SYMBOL(ulist_init); + +/** + * ulist_fini - free up additionally allocated memory for the ulist + * @ulist: the ulist from which to free the additional memory + * + * This is useful in cases where the base 'struct ulist' has been statically + * allocated. + */ +void ulist_fini(struct ulist *ulist) +{ + /* + * The first ULIST_SIZE elements are stored inline in struct ulist. + * Only if more elements are alocated they need to be freed. + */ + if (ulist->nodes_alloced > ULIST_SIZE) + kfree(ulist->nodes); + ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ +} +EXPORT_SYMBOL(ulist_fini); + +/** + * ulist_reinit - prepare a ulist for reuse + * @ulist: ulist to be reused + * + * Free up all additional memory allocated for the list elements and reinit + * the ulist. + */ +void ulist_reinit(struct ulist *ulist) +{ + ulist_fini(ulist); + ulist_init(ulist); +} +EXPORT_SYMBOL(ulist_reinit); + +/** + * ulist_alloc - dynamically allocate a ulist + * @gfp_mask: allocation flags to for base allocation + * + * The allocated ulist will be returned in an initialized state. + */ +struct ulist *ulist_alloc(unsigned long gfp_mask) +{ + struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); + + if (!ulist) + return NULL; + + ulist_init(ulist); + + return ulist; +} +EXPORT_SYMBOL(ulist_alloc); + +/** + * ulist_free - free dynamically allocated ulist + * @ulist: ulist to free + * + * It is not necessary to call ulist_fini before. + */ +void ulist_free(struct ulist *ulist) +{ + if (!ulist) + return; + ulist_fini(ulist); + kfree(ulist); +} +EXPORT_SYMBOL(ulist_free); + +/** + * ulist_add - add an element to the ulist + * @ulist: ulist to add the element to + * @val: value to add to ulist + * @aux: auxiliary value to store along with val + * @gfp_mask: flags to use for allocation + * + * Note: locking must be provided by the caller. In case of rwlocks write + * locking is needed + * + * Add an element to a ulist. The @val will only be added if it doesn't + * already exist. If it is added, the auxiliary value @aux is stored along with + * it. In case @val already exists in the ulist, @aux is ignored, even if + * it differs from the already stored value. + * + * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been + * inserted. + * In case of allocation failure -ENOMEM is returned and the ulist stays + * unaltered. + */ +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask) +{ + int i; + + for (i = 0; i < ulist->nnodes; ++i) { + if (ulist->nodes[i].val == val) + return 0; + } + + if (ulist->nnodes >= ulist->nodes_alloced) { + u64 new_alloced = ulist->nodes_alloced + 128; + struct ulist_node *new_nodes; + void *old = NULL; + + /* + * if nodes_alloced == ULIST_SIZE no memory has been allocated + * yet, so pass NULL to krealloc + */ + if (ulist->nodes_alloced > ULIST_SIZE) + old = ulist->nodes; + + new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, + gfp_mask); + if (!new_nodes) + return -ENOMEM; + + if (!old) + memcpy(new_nodes, ulist->int_nodes, + sizeof(ulist->int_nodes)); + + ulist->nodes = new_nodes; + ulist->nodes_alloced = new_alloced; + } + ulist->nodes[ulist->nnodes].val = val; + ulist->nodes[ulist->nnodes].aux = aux; + ++ulist->nnodes; + + return 1; +} +EXPORT_SYMBOL(ulist_add); + +/** + * ulist_next - iterate ulist + * @ulist: ulist to iterate + * @prev: previously returned element or %NULL to start iteration + * + * Note: locking must be provided by the caller. In case of rwlocks only read + * locking is needed + * + * This function is used to iterate an ulist. The iteration is started with + * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * end is reached. No guarantee is made with respect to the order in which + * the elements are returned. They might neither be returned in order of + * addition nor in ascending order. + * It is allowed to call ulist_add during an enumeration. Newly added items + * are guaranteed to show up in the running enumeration. + */ +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +{ + int next; + + if (ulist->nnodes == 0) + return NULL; + + if (!prev) + return &ulist->nodes[0]; + + next = (prev - ulist->nodes) + 1; + if (next < 0 || next >= ulist->nnodes) + return NULL; + + return &ulist->nodes[next]; +} +EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h new file mode 100644 index 00000000000..2e25dec58ec --- /dev/null +++ b/fs/btrfs/ulist.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2011 STRATO AG + * written by Arne Jansen <sensille@gmx.net> + * Distributed under the GNU GPL license version 2. + * + */ + +#ifndef __ULIST__ +#define __ULIST__ + +/* + * ulist is a generic data structure to hold a collection of unique u64 + * values. The only operations it supports is adding to the list and + * enumerating it. + * It is possible to store an auxiliary value along with the key. + * + * The implementation is preliminary and can probably be sped up + * significantly. A first step would be to store the values in an rbtree + * as soon as ULIST_SIZE is exceeded. + */ + +/* + * number of elements statically allocated inside struct ulist + */ +#define ULIST_SIZE 16 + +/* + * element of the list + */ +struct ulist_node { + u64 val; /* value to store */ + unsigned long aux; /* auxiliary value saved along with the val */ +}; + +struct ulist { + /* + * number of elements stored in list + */ + unsigned long nnodes; + + /* + * number of nodes we already have room for + */ + unsigned long nodes_alloced; + + /* + * pointer to the array storing the elements. The first ULIST_SIZE + * elements are stored inline. In this case the it points to int_nodes. + * After exceeding ULIST_SIZE, dynamic memory is allocated. + */ + struct ulist_node *nodes; + + /* + * inline storage space for the first ULIST_SIZE entries + */ + struct ulist_node int_nodes[ULIST_SIZE]; +}; + +void ulist_init(struct ulist *ulist); +void ulist_fini(struct ulist *ulist); +void ulist_reinit(struct ulist *ulist); +struct ulist *ulist_alloc(unsigned long gfp_mask); +void ulist_free(struct ulist *ulist); +int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9..ef41f285a47 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/kthread.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -32,6 +33,7 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" +#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -246,7 +248,7 @@ loop_lock: sync_pending = 0; } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -457,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!latest_transid || + device->generation > latest_transid) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } if (device->bdev) { blkdev_put(device->bdev, device->mode); @@ -485,6 +498,10 @@ again: goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); return 0; } @@ -706,8 +723,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; - mutex_lock(&uuid_mutex); - flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,6 +731,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } + mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -737,9 +753,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: + mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: - mutex_unlock(&uuid_mutex); return ret; } @@ -829,7 +845,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +863,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +907,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1282,7 +1296,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1452,7 +1465,6 @@ error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1469,8 +1481,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1629,7 +1640,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1695,7 +1705,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -1757,8 +1767,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1766,7 +1775,7 @@ error: mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -1959,7 +1968,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; @@ -2077,6 +2086,362 @@ error: return ret; } +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->profiles & chunk_profile) + return 0; + + return 1; +} + +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor <= 0) + return 0; + if (factor >= 100) + return num; + + num *= factor; + do_div(num, 100); + return num; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + factor = num_stripes / factor; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + +static int chunk_soft_convert_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->target & chunk_profile) + return 1; + + return 0; +} + +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + + return 1; +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2086,29 +2451,28 @@ static u64 div_factor(u64 num, int factor) return num; } -int btrfs_balance(struct btrfs_root *dev_root) +static int __btrfs_balance(struct btrfs_fs_info *fs_info) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; + int ret; + int enospc_errors = 0; + bool counting = true; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2137,11 +2501,23 @@ int btrfs_balance(struct btrfs_root *dev_root) ret = -ENOMEM; goto error; } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2151,15 +2527,19 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid) break; @@ -2167,22 +2547,375 @@ int btrfs_balance(struct btrfs_root *dev_root) if (found_key.offset == 0) break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); btrfs_release_path(path); + if (!ret) + goto loop; + + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; + if (ret == -ENOSPC) { + enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } +loop: key.offset = found_key.offset - 1; } - ret = 0; + + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + return ret; } +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + int ret; + + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + BUG_ON(ret); +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; + int ret; + + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { + ret = -EINVAL; + goto out; + } + + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + printk(KERN_ERR "btrfs: with mixed groups data and " + "metadata balance options must be the same\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * Profile changing sanity checks. Skip them if a simple + * balance is requested. + */ + if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & + BTRFS_BALANCE_ARGS_CONVERT)) + goto do_balance; + + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (fs_info->fs_devices->num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (fs_info->fs_devices->num_devices < 4) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + else + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10); + + if (!profile_is_valid(bctl->data.target, 1) || + bctl->data.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "data profile %llu\n", + (unsigned long long)bctl->data.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->meta.target, 1) || + bctl->meta.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "metadata profile %llu\n", + (unsigned long long)bctl->meta.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->sys.target, 1) || + bctl->sys.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "system profile %llu\n", + (unsigned long long)bctl->sys.target); + ret = -EINVAL; + goto out; + } + + if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } + } + +do_balance: + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret && ret != -EEXIST) + goto out; + + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } + + atomic_inc(&fs_info->balance_running); + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); + + return ret; +out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else + kfree(bctl); + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_balance_control *bctl = + (struct btrfs_balance_control *)data; + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret = 0; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + } else { + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +int btrfs_recover_balance(struct btrfs_root *tree_root) +{ + struct task_struct *tsk; + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + goto out_bctl; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out_bctl; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = tree_root->fs_info; + bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); + if (IS_ERR(tsk)) + ret = PTR_ERR(tsk); + else + goto out; + +out_bctl: + kfree(bctl); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -2323,8 +3056,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2441,10 +3173,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 8 * 1024 * 1024; + max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", @@ -2496,7 +3232,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +3423,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } @@ -2752,8 +3488,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); + fs_info->avail_metadata_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -2763,8 +3498,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); + fs_info->avail_system_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, @@ -2901,26 +3635,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2939,32 +3660,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -2980,10 +3675,7 @@ again: if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); @@ -3059,81 +3751,55 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { - bbio->stripes[i].length -= - stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { - bbio->stripes[i].length -= - stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; @@ -3155,15 +3821,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, @@ -3304,7 +3977,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); return 0; } @@ -3399,7 +4072,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; @@ -3568,7 +4241,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3606,7 +4279,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3699,6 +4371,20 @@ int btrfs_read_sys_array(struct btrfs_root *root) return -ENOMEM; btrfs_set_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -3749,6 +4435,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3799,6 +4488,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37..19ac95048b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,51 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + +struct btrfs_balance_args; +struct btrfs_balance_progress; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; + + struct btrfs_balance_progress stat; +}; + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); +int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3848b04e310..e7a5659087e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction(trans, root); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 19d8eb7fdc8..1a30db77af3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -41,7 +41,6 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> -#include <linux/cleancache.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -231,55 +230,6 @@ out: return ret; } -/* If invalidate_buffers() will trash dirty buffers, it means some kind - of fs corruption is going on. Trashing dirty data always imply losing - information that was supposed to be just stored on the physical layer - by the user. - - Thus invalidate_buffers in general usage is not allwowed to trash - dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to - be preserved. These buffers are simply skipped. - - We also skip buffers which are still in use. For example this can - happen if a userspace program is reading the block device. - - NOTE: In the case where the user removed a removable-media-disk even if - there's still dirty data not synced on disk (due a bug in the device driver - or due an error of the user), by not destroying the dirty buffers we could - generate corruption also on the next media inserted, thus a parameter is - necessary to handle this case in the most safe way possible (trying - to not corrupt also the new disk inserted with the data belonging to - the old now corrupted disk). Also for the ramdisk the natural thing - to do in order to release the ramdisk memory is to destroy dirty buffers. - - These are two special cases. Normal usage imply the device driver - to issue a sync on the device (without waiting I/O completion) and - then an invalidate_buffers call that doesn't trash dirty buffers. - - For handling cache coherency with the blkdev pagecache the 'update' case - is been introduced. It is needed to re-read from disk any pinned - buffer. NOTE: re-reading from disk is destructive so we can do it only - when we assume nobody is changing the buffercache under our I/O and when - we think the disk contains more recent information than the buffercache. - The update == 1 pass marks the buffers we need to update, the update == 2 - pass does the actual I/O. */ -void invalidate_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_flush_inode(mapping); -} -EXPORT_SYMBOL(invalidate_bdev); - /* * Kick the writeback threads then try to free up some ZONE_NORMAL memory. */ diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1064805e653..67bef6d0148 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/mount.h> -#include <linux/buffer_head.h> #include "internal.h" #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8b53193e4f7..620daad201d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_cap_lock); + spin_lock(&cap->session->s_gen_ttl_lock); gen = cap->session->s_cap_gen; ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_cap_lock); + spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " @@ -928,7 +928,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, mode_t mode, + uid_t uid, gid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -1078,7 +1078,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 size, max_size; struct timespec mtime, atime; int wake = 0; - mode_t mode; + umode_t mode; uid_t uid; gid_t gid; struct ceph_mds_session *session; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 98954003a8d..3e8094be460 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -666,7 +666,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) } static int ceph_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -676,7 +676,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -699,7 +699,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { dout("create in dir %p dentry %p name '%.*s'\n", @@ -753,7 +753,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -767,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) dout("mksnap dir %p snap '%.*s' dn %p\n", dir, dentry->d_name.len, dentry->d_name.name, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { goto out; @@ -870,7 +870,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("unlink/rmdir dir %p dn %p inode %p\n", dir, dentry, inode); - op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? + op = S_ISDIR(dentry->d_inode->i_mode) ? CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; @@ -973,12 +973,12 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { + if (di->lease_session) { s = di->lease_session; - spin_lock(&s->s_cap_lock); + spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; ttl = s->s_cap_ttl; - spin_unlock(&s->s_cap_lock); + spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, dentry->d_time) && @@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - if (di) { - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; - } + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, @@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, */ void ceph_dir_set_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry) && + ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } void ceph_dir_clear_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) complete\n", inode, dentry); + set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); } bool ceph_dir_test_complete(struct inode *inode) { - /* not yet implemented */ + struct dentry *dentry = d_find_any_alias(inode); + + if (dentry && ceph_dentry(dentry)) { + dout(" marking %p (%p) NOT complete\n", inode, dentry); + clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + } + dput(dentry); return false; } @@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, do { ceph_mdsc_get_request(req); spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { @@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, } else { wait_for_completion(&req->r_safe_completion); } - spin_lock(&ci->i_unsafe_lock); ceph_mdsc_put_request(req); + spin_lock(&ci->i_unsafe_lock); if (ret || list_empty(head)) break; req = list_entry(head->next, @@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_touch(struct dentry *dn) @@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_del(struct dentry *dn) @@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); } /* diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9fbcdecaacc..fbb2a643ef1 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, return -EINVAL; spin_lock(&dentry->d_lock); - parent = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - + parent = dentry->d_parent; if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); @@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } - dput(parent); + spin_unlock(&dentry->d_lock); return type; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 87fb132fb33..2c489378b4c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -384,7 +384,6 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ceph_inode_cachep, ci); } @@ -851,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_info *ci; struct ceph_dentry_info *di; BUG_ON(!inode); + ci = ceph_inode(inode); di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb4..866e8d7ca37 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; @@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; @@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; s->s_con.peer_name.num = cpu_to_le64(mds); - spin_lock_init(&s->s_cap_lock); + spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; s->s_cap_ttl = 0; + + spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); @@ -2326,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_cap_lock); + spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; session->s_cap_ttl = 0; - spin_unlock(&session->s_cap_lock); + spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; @@ -2772,7 +2776,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: - if (di && di->lease_session == session) { + if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); @@ -2781,7 +2785,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, break; case CEPH_MDS_LEASE_RENEW: - if (di && di->lease_session == session && + if (di->lease_session == session && di->lease_gen == session->s_cap_gen && di->lease_renew_from && di->lease_renew_after == 0) { diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a50ca0e3947..8c7c04ebb59 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -117,10 +117,13 @@ struct ceph_mds_session { void *s_authorizer_buf, *s_authorizer_reply_buf; size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; - /* protected by s_cap_lock */ - spinlock_t s_cap_lock; + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b48f15f101a..00de2c9568c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -131,6 +131,8 @@ enum { Opt_rbytes, Opt_norbytes, Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, Opt_ino32, }; @@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = { {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, {-1, NULL} }; @@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; @@ -341,11 +351,11 @@ out: /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to - * @mnt: mount descriptor + * @root: root of that (sub)tree */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ceph_show_options(struct seq_file *m, struct dentry *root) { - struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; struct ceph_options *opt = fsc->client->options; @@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) + seq_puts(m, ",dcache"); + else + seq_puts(m, ",nodcache"); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -636,19 +650,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { + struct inode *inode = req->r_target_inode; + req->r_target_inode = NULL; dout("open_root_inode success\n"); - if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && + if (ceph_ino(inode) == CEPH_INO_ROOT && fsc->sb->s_root == NULL) { - root = d_alloc_root(req->r_target_inode); - ceph_init_dentry(root); + root = d_alloc_root(inode); + if (!root) { + iput(inode); + root = ERR_PTR(-ENOMEM); + goto out; + } } else { - root = d_obtain_alias(req->r_target_inode); + root = d_obtain_alias(inode); } - req->r_target_inode = NULL; + ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } +out: ceph_mdsc_put_request(req); return root; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index edcbf3774a5..1421f3d875a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -28,6 +28,7 @@ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) @@ -136,7 +137,7 @@ struct ceph_cap_snap { int issued, dirty; struct ceph_snap_context *context; - mode_t mode; + umode_t mode; uid_t uid; gid_t gid; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a..a76f697303d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "ceph.file.layout", ceph_vxattrcb_layout}, + /* The following extended attribute name is deprecated */ { true, "ceph.layout", ceph_vxattrcb_layout}, - { NULL, NULL } + { true, NULL, NULL } }; static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) @@ -818,6 +820,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; + int required_blob_size; int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -833,14 +836,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name) return -EOPNOTSUPP; } + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); __build_xattrs(inode); +retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } + err = __remove_xattr_by_name(ceph_inode(inode), name); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; @@ -853,6 +876,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) do_sync: spin_unlock(&ci->i_ceph_lock); err = ceph_send_removexattr(dentry, name); +out: return err; } diff --git a/fs/char_dev.c b/fs/char_dev.c index dca9e5e0f73..3f152b92a94 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); - + cdev = cdev_alloc(); if (!cdev) goto out2; @@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor, cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); - + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp) goto out_cdev_put; if (filp->f_op->open) { - ret = filp->f_op->open(inode,filp); + ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f66cc162515..2b243af70aa 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -139,8 +139,7 @@ config CIFS_DFS_UPCALL points. If unsure, say N. config CIFS_FSCACHE - bool "Provide CIFS client caching support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y help Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data @@ -148,8 +147,8 @@ config CIFS_FSCACHE manager. If unsure, say N. config CIFS_ACL - bool "Provide CIFS ACL support (EXPERIMENTAL)" - depends on EXPERIMENTAL && CIFS_XATTR && KEYS + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS help Allows to fetch CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 84e8c072470..24b3dfc0528 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file, { char c; int rc; + static bool warned; rc = get_user(c, buffer); if (rc) return rc; if (c == '0' || c == 'n' || c == 'N') multiuser_mount = 0; - else if (c == '1' || c == 'y' || c == 'Y') + else if (c == '1' || c == 'y' || c == 'Y') { multiuser_mount = 1; + if (!warned) { + warned = true; + printk(KERN_WARNING "CIFS VFS: The legacy multiuser " + "mount code is scheduled to be deprecated in " + "3.5. Please switch to using the multiuser " + "mount option."); + } + } return count; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 500d6585927..c865bfdfe81 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -59,8 +59,8 @@ struct cifs_sb_info { gid_t mnt_gid; uid_t mnt_backupuid; gid_t mnt_backupgid; - mode_t mnt_file_mode; - mode_t mnt_dir_mode; + umode_t mnt_file_mode; + umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; char *mountdata; /* options received at mount time or via DFS refs */ struct backing_dev_info bdi; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2272fd5fe5b..e622863b292 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + - USER_KEY_LEN + strlen(sesInfo->user_name) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; + if (sesInfo->user_name) + desc_len += USER_KEY_LEN + strlen(sesInfo->user_name); + spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); if (description == NULL) @@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) dp = description + strlen(description); sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); - dp = description + strlen(description); - sprintf(dp, ";user=%s", sesInfo->user_name); + if (sesInfo->user_name) { + dp = description + strlen(description); + sprintf(dp, ";user=%s", sesInfo->user_name); + } dp = description + strlen(description); sprintf(dp, ";pid=0x%x", current->pid); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 1b2e180b018..fbb9da95184 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -27,17 +27,17 @@ #include "cifs_debug.h" /* - * cifs_ucs2_bytes - how long will a string be after conversion? - * @ucs - pointer to input string + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string * @maxbytes - don't go past this many bytes of input string * @codepage - destination codepage * - * Walk a ucs2le string and return the number of bytes that the string will + * Walk a utf16le string and return the number of bytes that the string will * be after being converted to the given charset, not including any null * termination required. Don't walk past maxbytes in the source buffer. */ int -cifs_ucs2_bytes(const __le16 *from, int maxbytes, +cifs_utf16_bytes(const __le16 *from, int maxbytes, const struct nls_table *codepage) { int i; @@ -122,7 +122,7 @@ cp_convert: } /* - * cifs_from_ucs2 - convert utf16le string to local charset + * cifs_from_utf16 - convert utf16le string to local charset * @to - destination buffer * @from - source buffer * @tolen - destination buffer size (in bytes) @@ -130,7 +130,7 @@ cp_convert: * @codepage - codepage to which characters should be converted * @mapchar - should characters be remapped according to the mapchars option? * - * Convert a little-endian ucs2le string (as sent by the server) to a string + * Convert a little-endian utf16le string (as sent by the server) to a string * in the provided codepage. The tolen and fromlen parameters are to ensure * that the code doesn't walk off of the end of the buffer (which is always * a danger if the alignment of the source buffer is off). The destination @@ -139,12 +139,12 @@ cp_convert: * null terminator). * * Note that some windows versions actually send multiword UTF-16 characters - * instead of straight UCS-2. The linux nls routines however aren't able to + * instead of straight UTF16-2. The linux nls routines however aren't able to * deal with those characters properly. In the event that we get some of * those characters, they won't be translated properly. */ int -cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, +cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, const struct nls_table *codepage, bool mapchar) { int i, charlen, safelen; @@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, } /* - * NAME: cifs_strtoUCS() + * NAME: cifs_strtoUTF16() * * FUNCTION: Convert character string to unicode string * */ int -cifs_strtoUCS(__le16 *to, const char *from, int len, +cifs_strtoUTF16(__le16 *to, const char *from, int len, const struct nls_table *codepage) { int charlen; @@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, for (i = 0; len && *from; i++, from += charlen, len -= charlen) { charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { - cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", + cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d", *from, charlen); /* A question mark */ wchar_to = 0x003f; @@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, } /* - * cifs_strndup_from_ucs - copy a string from wire format to the local codepage + * cifs_strndup_from_utf16 - copy a string from wire format to the local + * codepage * @src - source string * @maxlen - don't walk past this many bytes in the source string * @is_unicode - is this a unicode string? @@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len, * error. */ char * -cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, - const struct nls_table *codepage) +cifs_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, const struct nls_table *codepage) { int len; char *dst; if (is_unicode) { - len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage); + len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage); len += nls_nullsize(codepage); dst = kmalloc(len, GFP_KERNEL); if (!dst) return NULL; - cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage, + cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, false); } else { len = strnlen(src, maxlen); @@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, * names are little endian 16 bit Unicode on the wire */ int -cifsConvertToUCS(__le16 *target, const char *source, int srclen, +cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { int i, j, charlen; @@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, wchar_t tmp; if (!mapChars) - return cifs_strtoUCS(target, source, PATH_MAX, cp); + return cifs_strtoUTF16(target, source, PATH_MAX, cp); for (i = 0, j = 0; i < srclen; j++) { src_char = source[i]; @@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, switch (src_char) { case 0: put_unaligned(0, &target[j]); - goto ctoUCS_out; + goto ctoUTF16_out; case ':': dst_char = cpu_to_le16(UNI_COLON); break; @@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, put_unaligned(dst_char, &target[j]); } -ctoUCS_out: +ctoUTF16_out: return i; } diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 6d02fd56056..a513a546700 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[]; #endif /* UNIUPR_NOLOWER */ #ifdef __KERNEL__ -int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, - const struct nls_table *codepage, bool mapchar); -int cifs_ucs2_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage); -int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); -char *cifs_strndup_from_ucs(const char *src, const int maxlen, - const bool is_unicode, - const struct nls_table *codepage); -extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, - const struct nls_table *cp, int mapChars); +int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar); +int cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage); +int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); +extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars); #endif diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 72ddf23ef6f..c1b25448738 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, umode_t group_mask = S_IRWXG; umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; + if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) + return; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); if (!ppace) { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5d9b9acc5fc..63c460e503b 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); attrptr->length = cpu_to_le16(2 * dlen); blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); - cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); + cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp); return 0; } @@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) kmalloc(attrsize + 1, GFP_KERNEL); if (!ses->domainName) return -ENOMEM; - cifs_from_ucs2(ses->domainName, + cifs_from_utf16(ses->domainName, (__le16 *)blobptr, attrsize, attrsize, nls_cp, false); break; @@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* convert ses->user_name to unicode and uppercase */ - len = strlen(ses->user_name); + len = ses->user_name ? strlen(ses->user_name) : 0; user = kmalloc(2 + (len * 2), GFP_KERNEL); if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; return rc; } - len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); - UniStrupr(user); + + if (len) { + len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); + UniStrupr(user); + } else { + memset(user, '\0', 2); + } rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)user, 2 * len); @@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = -ENOMEM; return rc; } - len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, - nls_cp); + len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, + nls_cp); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)domain, 2 * len); @@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = -ENOMEM; return rc; } - len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len, nls_cp); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8f1fe324162..b1fd382d195 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -343,9 +343,9 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) * ones are. */ static int -cifs_show_options(struct seq_file *s, struct vfsmount *m) +cifs_show_options(struct seq_file *s, struct dentry *root) { - struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; @@ -393,7 +393,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) cifs_show_address(s, tcon->ses->server); if (!tcon->unix_ext) - seq_printf(s, ",file_mode=0%o,dir_mode=0%o", + seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); if (tcon->seal) @@ -430,7 +430,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_printf(s, ",dynperm"); - if (m->mnt_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & MS_POSIXACL) seq_printf(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_printf(s, ",mfsymlinks"); @@ -488,7 +488,7 @@ static void cifs_umount_begin(struct super_block *sb) } #ifdef CONFIG_CIFS_STATS2 -static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt) +static int cifs_show_stats(struct seq_file *s, struct dentry *root) { /* BB FIXME */ return 0; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 30ff56005d8..fe5ecf1b422 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -44,14 +44,14 @@ extern const struct address_space_operations cifs_addr_ops_smallbuf; /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct inode *, struct dentry *, int, +extern int cifs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); -extern int cifs_mkdir(struct inode *, struct dentry *, int); +extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8238aa13e01..76e7d8b6da1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -169,8 +169,8 @@ struct smb_vol { gid_t linux_gid; uid_t backupuid; gid_t backupgid; - mode_t file_mode; - mode_t dir_mode; + umode_t file_mode; + umode_t dir_mode; unsigned secFlg; bool retry:1; bool intr:1; @@ -879,6 +879,8 @@ require use of the stronger protocol */ #define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ #endif /* UPCALL */ #else /* do not allow weak pw hash */ +#define CIFSSEC_MUST_LANMAN 0 +#define CIFSSEC_MUST_PLNTXT 0 #ifdef CONFIG_CIFS_UPCALL #define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ #else diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6600aa2d2ef..8b7794c3159 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -821,8 +821,8 @@ PsxDelete: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB add path length overrun check */ @@ -893,8 +893,8 @@ DelFileRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ @@ -938,8 +938,8 @@ RmDirRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ @@ -981,8 +981,8 @@ MkDirRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ @@ -1030,8 +1030,8 @@ PsxCreat: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, name, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, name, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -1197,8 +1197,8 @@ OldOpenRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { count = 1; /* account for one byte pad to word boundary */ name_len = - cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), - fileName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve check for buffer overruns BB */ @@ -1304,8 +1304,8 @@ openRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { count = 1; /* account for one byte pad to word boundary */ name_len = - cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), - fileName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), + fileName, PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; pSMB->NameLength = cpu_to_le16(name_len); @@ -2649,16 +2649,16 @@ renameRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; pSMB->OldFileName[name_len] = 0x04; /* pad */ /* protocol requires ASCII signature byte on Unicode string */ pSMB->OldFileName[name_len + 1] = 0x00; name_len2 = - cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], - toName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + toName, PATH_MAX, nls_codepage, remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ } else { /* BB improve the check for buffer overruns BB */ @@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon, /* unicode only call */ if (target_name == NULL) { sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid); - len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + len_of_str = + cifsConvertToUTF16((__le16 *)rename_info->target_name, dummy_string, 24, nls_codepage, remap); } else { - len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, + len_of_str = + cifsConvertToUTF16((__le16 *)rename_info->target_name, target_name, PATH_MAX, nls_codepage, remap); } @@ -2795,17 +2797,17 @@ copyRetry: pSMB->Flags = cpu_to_le16(flags & COPY_TREE); if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName, - fromName, PATH_MAX, nls_codepage, - remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, + fromName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; pSMB->OldFileName[name_len] = 0x04; /* pad */ /* protocol requires ASCII signature byte on Unicode string */ pSMB->OldFileName[name_len + 1] = 0x00; name_len2 = - cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], - toName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + toName, PATH_MAX, nls_codepage, remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ } else { /* BB improve the check for buffer overruns BB */ @@ -2861,9 +2863,9 @@ createSymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX - /* find define for this maxpathcomponent */ - , nls_codepage); + cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage); name_len++; /* trailing null */ name_len *= 2; @@ -2885,9 +2887,9 @@ createSymLinkRetry: data_offset = (char *) (&pSMB->hdr.Protocol) + offset; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len_target = - cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX - /* find define for this maxpathcomponent */ - , nls_codepage); + cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX + /* find define for this maxpathcomponent */ + , nls_codepage); name_len_target++; /* trailing null */ name_len_target *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -2949,8 +2951,8 @@ createHardLinkRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName, - PATH_MAX, nls_codepage, remap); + name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; @@ -2972,8 +2974,8 @@ createHardLinkRetry: data_offset = (char *) (&pSMB->hdr.Protocol) + offset; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len_target = - cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX, - nls_codepage, remap); + cifsConvertToUTF16((__le16 *) data_offset, fromName, + PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3042,8 +3044,8 @@ winCreateHardLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; @@ -3051,8 +3053,8 @@ winCreateHardLinkRetry: pSMB->OldFileName[name_len] = 0x04; pSMB->OldFileName[name_len + 1] = 0x00; /* pad */ name_len2 = - cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], - toName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], + toName, PATH_MAX, nls_codepage, remap); name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; name_len2 *= 2; /* convert to bytes */ } else { /* BB improve the check for buffer overruns BB */ @@ -3108,8 +3110,8 @@ querySymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage); + cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3166,8 +3168,8 @@ querySymLinkRetry: is_unicode = false; /* BB FIXME investigate remapping reserved chars here */ - *symlinkinfo = cifs_strndup_from_ucs(data_start, count, - is_unicode, nls_codepage); + *symlinkinfo = cifs_strndup_from_utf16(data_start, + count, is_unicode, nls_codepage); if (!*symlinkinfo) rc = -ENOMEM; } @@ -3450,8 +3452,9 @@ queryAclRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; pSMB->FileName[name_len] = 0; @@ -3537,8 +3540,8 @@ setAclRetry: return rc; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3948,8 +3951,9 @@ QInfRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { @@ -4086,8 +4090,8 @@ QPathInfoRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -4255,8 +4259,8 @@ UnixQPathInfoRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -4344,8 +4348,8 @@ findFirstRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); /* We can not add the asterik earlier in case it got remapped to 0xF03A as if it were part of the directory name instead of a wildcard */ @@ -4656,8 +4660,9 @@ GetInodeNumberRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, rc = -ENOMEM; goto parse_DFS_referrals_exit; } - cifsConvertToUCS((__le16 *) tmp, searchName, - PATH_MAX, nls_codepage, remap); - node->path_consumed = cifs_ucs2_bytes(tmp, + cifsConvertToUTF16((__le16 *) tmp, searchName, + PATH_MAX, nls_codepage, remap); + node->path_consumed = cifs_utf16_bytes(tmp, le16_to_cpu(pSMBr->PathConsumed), nls_codepage); kfree(tmp); @@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, /* copy DfsPath */ temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); max_len = data_end - temp; - node->path_name = cifs_strndup_from_ucs(temp, max_len, - is_unicode, nls_codepage); + node->path_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); if (!node->path_name) { rc = -ENOMEM; goto parse_DFS_referrals_exit; @@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, /* copy link target UNC */ temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); max_len = data_end - temp; - node->node_name = cifs_strndup_from_ucs(temp, max_len, - is_unicode, nls_codepage); + node->node_name = cifs_strndup_from_utf16(temp, max_len, + is_unicode, nls_codepage); if (!node->node_name) rc = -ENOMEM; } @@ -4873,8 +4878,9 @@ getDFSRetry: if (ses->capabilities & CAP_UNICODE) { pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; name_len = - cifsConvertToUCS((__le16 *) pSMB->RequestFileName, - searchName, PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->RequestFileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -5506,8 +5512,8 @@ SetEOFRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -5796,8 +5802,8 @@ SetTimesRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -5877,8 +5883,8 @@ SetAttrLgcyRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - ConvertToUCS((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage); + ConvertToUTF16((__le16 *) pSMB->fileName, fileName, + PATH_MAX, nls_codepage); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -6030,8 +6036,8 @@ setPermsRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -6123,8 +6129,8 @@ QAllEAsRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { list_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, + PATH_MAX, nls_codepage, remap); list_len++; /* trailing null */ list_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -6301,8 +6307,8 @@ SetEARetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, - PATH_MAX, nls_codepage, remap); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f3670cf7258..602f77c304c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -38,6 +38,7 @@ #include <asm/processor.h> #include <linux/inet.h> #include <linux/module.h> +#include <keys/user-type.h> #include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" @@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB) static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) { - struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; + struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond; struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB; - char *data_area_of_target; - char *data_area_of_buf2; + char *data_area_of_tgt; + char *data_area_of_src; int remaining; - unsigned int byte_count, total_in_buf; - __u16 total_data_size, total_in_buf2; + unsigned int byte_count, total_in_tgt; + __u16 tgt_total_cnt, src_total_cnt, total_in_src; - total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); + src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount); + tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); - if (total_data_size != - get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) - cFYI(1, "total data size of primary and secondary t2 differ"); + if (tgt_total_cnt != src_total_cnt) + cFYI(1, "total data count of primary and secondary t2 differ " + "source=%hu target=%hu", src_total_cnt, tgt_total_cnt); - total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); + total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); - remaining = total_data_size - total_in_buf; + remaining = tgt_total_cnt - total_in_tgt; - if (remaining < 0) + if (remaining < 0) { + cFYI(1, "Server sent too much data. tgt_total_cnt=%hu " + "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt); return -EPROTO; + } - if (remaining == 0) /* nothing to do, ignore */ + if (remaining == 0) { + /* nothing to do, ignore */ + cFYI(1, "no more data remains"); return 0; + } - total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); - if (remaining < total_in_buf2) { + total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount); + if (remaining < total_in_src) cFYI(1, "transact2 2nd response contains too much data"); - } /* find end of first SMB data area */ - data_area_of_target = (char *)&pSMBt->hdr.Protocol + + data_area_of_tgt = (char *)&pSMBt->hdr.Protocol + get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); - /* validate target area */ - data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + - get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); + /* validate target area */ + data_area_of_src = (char *)&pSMBs->hdr.Protocol + + get_unaligned_le16(&pSMBs->t2_rsp.DataOffset); - data_area_of_target += total_in_buf; + data_area_of_tgt += total_in_tgt; - /* copy second buffer into end of first buffer */ - total_in_buf += total_in_buf2; + total_in_tgt += total_in_src; /* is the result too big for the field? */ - if (total_in_buf > USHRT_MAX) + if (total_in_tgt > USHRT_MAX) { + cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt); return -EPROTO; - put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + } + put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount); /* fix up the BCC */ byte_count = get_bcc(pTargetSMB); - byte_count += total_in_buf2; + byte_count += total_in_src; /* is the result too big for the field? */ - if (byte_count > USHRT_MAX) + if (byte_count > USHRT_MAX) { + cFYI(1, "coalesced BCC too large (%u)", byte_count); return -EPROTO; + } put_bcc(byte_count, pTargetSMB); byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); - byte_count += total_in_buf2; + byte_count += total_in_src; /* don't allow buffer to overflow */ - if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) + if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count); return -ENOBUFS; + } pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); - memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); + /* copy second buffer into end of first buffer */ + memcpy(data_area_of_tgt, data_area_of_src, total_in_src); - if (remaining == total_in_buf2) { - cFYI(1, "found the last secondary response"); - return 0; /* we are done */ - } else /* more responses to go */ + if (remaining != total_in_src) { + /* more responses to go */ + cFYI(1, "waiting for more secondary responses"); return 1; + } + + /* we are done */ + cFYI(1, "found the last secondary response"); + return 0; } static void @@ -756,10 +773,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); - if (mid) - handle_mid(mid, server, smb_buffer, length); + if (!mid) + return length; - return length; + handle_mid(mid, server, smb_buffer, length); + return 0; } static int @@ -1578,11 +1596,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } } - if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { - cERROR(1, "Multiuser mounts currently require krb5 " - "authentication!"); +#ifndef CONFIG_KEYS + /* Muliuser mounts require CONFIG_KEYS support */ + if (vol->multiuser) { + cERROR(1, "Multiuser mounts require kernels with " + "CONFIG_KEYS enabled."); goto cifs_parse_mount_err; } +#endif if (vol->UNCip == NULL) vol->UNCip = &vol->UNC[2]; @@ -1981,10 +2002,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) return 0; break; default: + /* NULL username means anonymous session */ + if (ses->user_name == NULL) { + if (!vol->nullauth) + return 0; + break; + } + /* anything else takes username/password */ - if (ses->user_name == NULL) - return 0; - if (strncmp(ses->user_name, vol->username, + if (strncmp(ses->user_name, + vol->username ? vol->username : "", MAX_USERNAME_SIZE)) return 0; if (strlen(vol->username) != 0 && @@ -2039,6 +2066,132 @@ cifs_put_smb_ses(struct cifs_ses *ses) cifs_put_tcp_session(server); } +#ifdef CONFIG_KEYS + +/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */ +#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1) + +/* Populate username and pw fields from keyring if possible */ +static int +cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) +{ + int rc = 0; + char *desc, *delim, *payload; + ssize_t len; + struct key *key; + struct TCP_Server_Info *server = ses->server; + struct sockaddr_in *sa; + struct sockaddr_in6 *sa6; + struct user_key_payload *upayload; + + desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); + if (!desc) + return -ENOMEM; + + /* try to find an address key first */ + switch (server->dstaddr.ss_family) { + case AF_INET: + sa = (struct sockaddr_in *)&server->dstaddr; + sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr); + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)&server->dstaddr; + sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); + break; + default: + cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family); + rc = -EINVAL; + goto out_err; + } + + cFYI(1, "%s: desc=%s", __func__, desc); + key = request_key(&key_type_logon, desc, ""); + if (IS_ERR(key)) { + if (!ses->domainName) { + cFYI(1, "domainName is NULL"); + rc = PTR_ERR(key); + goto out_err; + } + + /* didn't work, try to find a domain key */ + sprintf(desc, "cifs:d:%s", ses->domainName); + cFYI(1, "%s: desc=%s", __func__, desc); + key = request_key(&key_type_logon, desc, ""); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + goto out_err; + } + } + + down_read(&key->sem); + upayload = key->payload.data; + if (IS_ERR_OR_NULL(upayload)) { + rc = upayload ? PTR_ERR(upayload) : -EINVAL; + goto out_key_put; + } + + /* find first : in payload */ + payload = (char *)upayload->data; + delim = strnchr(payload, upayload->datalen, ':'); + cFYI(1, "payload=%s", payload); + if (!delim) { + cFYI(1, "Unable to find ':' in payload (datalen=%d)", + upayload->datalen); + rc = -EINVAL; + goto out_key_put; + } + + len = delim - payload; + if (len > MAX_USERNAME_SIZE || len <= 0) { + cFYI(1, "Bad value from username search (len=%zd)", len); + rc = -EINVAL; + goto out_key_put; + } + + vol->username = kstrndup(payload, len, GFP_KERNEL); + if (!vol->username) { + cFYI(1, "Unable to allocate %zd bytes for username", len); + rc = -ENOMEM; + goto out_key_put; + } + cFYI(1, "%s: username=%s", __func__, vol->username); + + len = key->datalen - (len + 1); + if (len > MAX_PASSWORD_SIZE || len <= 0) { + cFYI(1, "Bad len for password search (len=%zd)", len); + rc = -EINVAL; + kfree(vol->username); + vol->username = NULL; + goto out_key_put; + } + + ++delim; + vol->password = kstrndup(delim, len, GFP_KERNEL); + if (!vol->password) { + cFYI(1, "Unable to allocate %zd bytes for password", len); + rc = -ENOMEM; + kfree(vol->username); + vol->username = NULL; + goto out_key_put; + } + +out_key_put: + up_read(&key->sem); + key_put(key); +out_err: + kfree(desc); + cFYI(1, "%s: returning %d", __func__, rc); + return rc; +} +#else /* ! CONFIG_KEYS */ +static inline int +cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), + struct cifs_ses *ses __attribute__((unused))) +{ + return -ENOSYS; +} +#endif /* CONFIG_KEYS */ + static bool warned_on_ntlm; /* globals init to false automatically */ static struct cifs_ses * @@ -2819,7 +2972,7 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cFYI(1, "file mode: 0x%x dir mode: 0x%x", + cFYI(1, "file mode: 0x%hx dir mode: 0x%hx", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; @@ -2914,18 +3067,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, #define CIFS_DEFAULT_IOSIZE (1024 * 1024) /* - * Windows only supports a max of 60k reads. Default to that when posix - * extensions aren't in force. + * Windows only supports a max of 60kb reads and 65535 byte writes. Default to + * those values when posix extensions aren't in force. In actuality here, we + * use 65536 to allow for a write that is a multiple of 4k. Most servers seem + * to be ok with the extra byte even though Windows doesn't send writes that + * are that large. + * + * Citation: + * + * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx */ #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) +#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536) static unsigned int cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) { __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; - unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : - CIFS_DEFAULT_IOSIZE; + unsigned int wsize; + + /* start with specified wsize, or default */ + if (pvolume_info->wsize) + wsize = pvolume_info->wsize; + else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) + wsize = CIFS_DEFAULT_IOSIZE; + else + wsize = CIFS_DEFAULT_NON_POSIX_WSIZE; /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -3136,10 +3304,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, return -EINVAL; if (volume_info->nullauth) { - cFYI(1, "null user"); - volume_info->username = kzalloc(1, GFP_KERNEL); - if (volume_info->username == NULL) - return -ENOMEM; + cFYI(1, "Anonymous login"); + kfree(volume_info->username); + volume_info->username = NULL; } else if (volume_info->username) { /* BB fixme parse for domain name here */ cFYI(1, "Username: %s", volume_info->username); @@ -3478,7 +3645,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, if (ses->capabilities & CAP_UNICODE) { smb_buffer->Flags2 |= SMBFLG2_UNICODE; length = - cifs_strtoUCS((__le16 *) bcc_ptr, tree, + cifs_strtoUTF16((__le16 *) bcc_ptr, tree, 6 /* max utf8 char length in bytes */ * (/* server len*/ + 256 /* share len */), nls_codepage); bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ @@ -3533,7 +3700,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); - tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, + tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr, bytes_left, is_unicode, nls_codepage); @@ -3657,25 +3824,43 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses, return rc; } +static int +cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) +{ + switch (ses->server->secType) { + case Kerberos: + vol->secFlg = CIFSSEC_MUST_KRB5; + return 0; + case NTLMv2: + vol->secFlg = CIFSSEC_MUST_NTLMV2; + break; + case NTLM: + vol->secFlg = CIFSSEC_MUST_NTLM; + break; + case RawNTLMSSP: + vol->secFlg = CIFSSEC_MUST_NTLMSSP; + break; + case LANMAN: + vol->secFlg = CIFSSEC_MUST_LANMAN; + break; + } + + return cifs_set_cifscreds(vol, ses); +} + static struct cifs_tcon * cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) { + int rc; struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb); struct cifs_ses *ses; struct cifs_tcon *tcon = NULL; struct smb_vol *vol_info; - char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */ - /* We used to have this as MAX_USERNAME which is */ - /* way too big now (256 instead of 32) */ vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); - if (vol_info == NULL) { - tcon = ERR_PTR(-ENOMEM); - goto out; - } + if (vol_info == NULL) + return ERR_PTR(-ENOMEM); - snprintf(username, sizeof(username), "krb50x%x", fsuid); - vol_info->username = username; vol_info->local_nls = cifs_sb->local_nls; vol_info->linux_uid = fsuid; vol_info->cred_uid = fsuid; @@ -3685,8 +3870,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) vol_info->local_lease = master_tcon->local_lease; vol_info->no_linux_ext = !master_tcon->unix_ext; - /* FIXME: allow for other secFlg settings */ - vol_info->secFlg = CIFSSEC_MUST_KRB5; + rc = cifs_set_vol_auth(vol_info, master_tcon->ses); + if (rc) { + tcon = ERR_PTR(rc); + goto out; + } /* get a reference for the same TCP session */ spin_lock(&cifs_tcp_ses_lock); @@ -3709,6 +3897,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) if (ses->capabilities & CAP_UNIX) reset_cifs_unix_caps(0, tcon, NULL, vol_info); out: + kfree(vol_info->username); + kfree(vol_info->password); kfree(vol_info); return tcon; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d7eeb9d3ed6..bc7e24420ac 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -136,7 +136,7 @@ cifs_bp_rename_retry: /* Inode operations in similar order to how they appear in Linux file fs.h */ int -cifs_create(struct inode *inode, struct dentry *direntry, int mode, +cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct nameidata *nd) { int rc = -ENOENT; @@ -355,7 +355,7 @@ cifs_create_out: return rc; } -int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, +int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; @@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ - __u32 oplock = 0; + __u32 oplock = enable_oplocks ? REQ_OPLOCK : 0; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; @@ -584,10 +584,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * If either that or op not supported returned, follow * the normal lookup. */ - if ((rc == 0) || (rc == -ENOENT)) + switch (rc) { + case 0: + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + if (newInode && !S_ISREG(newInode->i_mode)) { + CIFSSMBClose(xid, pTcon, fileHandle); + break; + } + case -ENOENT: posix_open = true; - else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) + case -EOPNOTSUPP: + break; + default: pTcon->broken_posix_open = true; + } } if (!posix_open) rc = cifs_get_inode_info_unix(&newInode, full_path, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4dd9283885e..5e64748a291 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -920,16 +920,26 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) for (lockp = &inode->i_flock; *lockp != NULL; \ lockp = &(*lockp)->fl_next) +struct lock_to_push { + struct list_head llist; + __u64 offset; + __u64 length; + __u32 pid; + __u16 netfid; + __u8 type; +}; + static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; - struct cifsLockInfo *lck, *tmp; + unsigned int count = 0, i = 0; int rc = 0, xid, type; + struct list_head locks_to_send, *el; + struct lock_to_push *lck, *tmp; __u64 length; - struct list_head locks_to_send; xid = GetXid(); @@ -940,29 +950,55 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) return rc; } + lock_flocks(); + cifs_for_each_lock(cfile->dentry->d_inode, before) { + if ((*before)->fl_flags & FL_POSIX) + count++; + } + unlock_flocks(); + INIT_LIST_HEAD(&locks_to_send); + /* + * Allocating count locks is enough because no locks can be added to + * the list while we are holding cinode->lock_mutex that protects + * locking operations of this inode. + */ + for (; i < count; i++) { + lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); + if (!lck) { + rc = -ENOMEM; + goto err_out; + } + list_add_tail(&lck->llist, &locks_to_send); + } + + i = 0; + el = locks_to_send.next; lock_flocks(); cifs_for_each_lock(cfile->dentry->d_inode, before) { + if (el == &locks_to_send) { + /* something is really wrong */ + cERROR(1, "Can't push all brlocks!"); + break; + } flock = *before; + if ((flock->fl_flags & FL_POSIX) == 0) + continue; length = 1 + flock->fl_end - flock->fl_start; if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) type = CIFS_RDLCK; else type = CIFS_WRLCK; - - lck = cifs_lock_init(flock->fl_start, length, type, - cfile->netfid); - if (!lck) { - rc = -ENOMEM; - goto send_locks; - } + lck = list_entry(el, struct lock_to_push, llist); lck->pid = flock->fl_pid; - - list_add_tail(&lck->llist, &locks_to_send); + lck->netfid = cfile->netfid; + lck->length = length; + lck->type = type; + lck->offset = flock->fl_start; + i++; + el = el->next; } - -send_locks: unlock_flocks(); list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { @@ -979,11 +1015,18 @@ send_locks: kfree(lck); } +out: cinode->can_cache_brlcks = false; mutex_unlock(&cinode->lock_mutex); FreeXid(xid); return rc; +err_out: + list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { + list_del(&lck->llist); + kfree(lck); + } + goto out; } static int diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e851d5b8931..745da3d0653 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -534,6 +534,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; + /* + * Server can return wrong NumberOfLinks value for directories + * when Unix extensions are disabled - fake it. + */ + fattr->cf_nlink = 2; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -541,9 +546,9 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, /* clear write bits if ATTR_READONLY is set */ if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~(S_IWUGO); - } - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + } fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; @@ -1264,7 +1269,7 @@ unlink_out: return rc; } -int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) +int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) { int rc = 0, tmprc; int xid; @@ -1275,7 +1280,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) struct inode *newinode = NULL; struct cifs_fattr fattr; - cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode); + cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); cifs_sb = CIFS_SB(inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -1322,7 +1327,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) } /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */ - inc_nlink(inode); cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); cifs_fill_uniqueid(inode->i_sb, &fattr); @@ -1355,7 +1359,6 @@ mkdir_retry_old: d_drop(direntry); } else { mkdir_get_info: - inc_nlink(inode); if (pTcon->unix_ext) rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); @@ -1436,6 +1439,11 @@ mkdir_get_info: } } mkdir_out: + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + CIFS_I(inode)->time = 0; kfree(full_path); FreeXid(xid); cifs_put_tlink(tlink); @@ -1475,7 +1483,6 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_put_tlink(tlink); if (!rc) { - drop_nlink(inode); spin_lock(&direntry->d_inode->i_lock); i_size_write(direntry->d_inode, 0); clear_nlink(direntry->d_inode); @@ -1483,12 +1490,15 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) } cifsInode = CIFS_I(direntry->d_inode); - cifsInode->time = 0; /* force revalidate to go get info when - needed */ + /* force revalidate to go get info when needed */ + cifsInode->time = 0; cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* force revalidate to get parent dir info - since cached search results now invalid */ + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + cifsInode->time = 0; direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a090bbe6ee2..e2bbc683e01 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, name.name = scratch_buf; name.len = - cifs_from_ucs2((char *)name.name, (__le16 *)de.name, - UNICODE_NAME_MAX, - min(de.namelen, (size_t)max_len), nlt, - cifs_sb->mnt_cifs_flags & + cifs_from_utf16((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min_t(size_t, de.namelen, + (size_t)max_len), nlt, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); name.len -= nls_nullsize(nlt); } else { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4ec3ee9d72c..551d0c2b973 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) int bytes_ret = 0; /* Copy OS version */ - bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, - nls_cp); + bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32, + nls_cp); bcc_ptr += 2 * bytes_ret; - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, - 32, nls_cp); + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release, + 32, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* trailing null */ - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, - 32, nls_cp); + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, + 32, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* trailing null */ @@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *(bcc_ptr+1) = 0; bytes_ret = 0; } else - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, - 256, nls_cp); + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName, + 256, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null terminator */ @@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *bcc_ptr = 0; *(bcc_ptr+1) = 0; } else { - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); } bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null termination */ @@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy user */ /* BB what about null user mounts - check that we do this BB */ /* copy user */ - if (ses->user_name != NULL) + if (ses->user_name != NULL) { strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + } /* else null user mount */ - - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ /* copy domain */ - if (ses->domainName != NULL) { strncpy(bcc_ptr, ses->domainName, 256); bcc_ptr += strnlen(ses->domainName, 256); @@ -287,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, cFYI(1, "bleft %d", bleft); kfree(ses->serverOS); - ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); cFYI(1, "serverOS=%s", ses->serverOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; @@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, return; kfree(ses->serverNOS); - ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); cFYI(1, "serverNOS=%s", ses->serverNOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; @@ -305,7 +304,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, return; kfree(ses->serverDomain); - ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); + ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp); cFYI(1, "serverDomain=%s", ses->serverDomain); return; @@ -395,6 +394,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); + if (tioffset > blob_len || tioffset + tilen > blob_len) { + cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen); + return -EINVAL; + } if (tilen) { ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); if (!ses->auth_key.response) { @@ -502,8 +505,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, tmp += 2; } else { int len; - len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, - MAX_USERNAME_SIZE, nls_cp); + len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, + MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); @@ -518,8 +521,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, tmp += 2; } else { int len; - len = cifs_strtoUCS((__le16 *)tmp, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, + MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 80d85088193..d5cd9aa7eac 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, /* Password cannot be longer than 128 characters */ if (passwd) /* Password must be converted to NT unicode */ - len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + len = cifs_strtoUTF16(wpwd, passwd, 128, codepage); else { len = 0; *wpwd = 0; /* Ensure string is null terminated */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 45f07c46f3e..10d92cf57ab 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -105,7 +105,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, struct cifs_tcon *pTcon; struct super_block *sb; char *full_path; - struct cifs_ntsd *pacl; if (direntry == NULL) return -EIO; @@ -164,23 +163,24 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + struct cifs_ntsd *pacl; pacl = kmalloc(value_size, GFP_KERNEL); if (!pacl) { cFYI(1, "%s: Can't allocate memory for ACL", __func__); rc = -ENOMEM; } else { -#ifdef CONFIG_CIFS_ACL memcpy(pacl, ea_value, value_size); rc = set_cifs_acl(pacl, value_size, direntry->d_inode, full_path, CIFS_ACL_DACL); if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); + } #else cFYI(1, "Set CIFS ACL not supported yet"); #endif /* CONFIG_CIFS_ACL */ - } } else { int temp; temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 6475877b076..911cf30d057 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, - link the two up if this is needed - fill in the attributes */ -int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb) +struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) { struct coda_vattr attr; + struct inode *inode; int error; /* We get inode numbers from Venus -- see venus source */ error = venus_getattr(sb, fid, &attr); - if ( error ) { - *inode = NULL; - return error; - } + if (error) + return ERR_PTR(error); - *inode = coda_iget(sb, fid, &attr); - if ( IS_ERR(*inode) ) { + inode = coda_iget(sb, fid, &attr); + if (IS_ERR(inode)) printk("coda_cnode_make: coda_iget failed\n"); - return PTR_ERR(*inode); - } - return 0; + return inode; } @@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) } /* the CONTROL inode is made without asking attributes from Venus */ -int coda_cnode_makectl(struct inode **inode, struct super_block *sb) +struct inode *coda_cnode_makectl(struct super_block *sb) { - int error = -ENOMEM; - - *inode = new_inode(sb); - if (*inode) { - (*inode)->i_ino = CTL_INO; - (*inode)->i_op = &coda_ioctl_inode_operations; - (*inode)->i_fop = &coda_ioctl_operations; - (*inode)->i_mode = 0444; - error = 0; + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = CTL_INO; + inode->i_op = &coda_ioctl_inode_operations; + inode->i_fop = &coda_ioctl_operations; + inode->i_mode = 0444; + return inode; } - - return error; + return ERR_PTR(-ENOMEM); } diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index e35071b1de0..b24fdfd8a3f 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -49,9 +49,9 @@ struct coda_file_info { #define C_DYING 0x4 /* from venus (which died) */ #define C_PURGE 0x8 -int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *); +struct inode *coda_cnode_make(struct CodaFid *, struct super_block *); struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr); -int coda_cnode_makectl(struct inode **inode, struct super_block *sb); +struct inode *coda_cnode_makectl(struct super_block *sb); struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb); void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 28e7e135cfa..17751582906 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -30,14 +30,14 @@ #include "coda_int.h" /* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd); +static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd); static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, struct dentry *entry); static int coda_unlink(struct inode *dir_inode, struct dentry *entry); static int coda_symlink(struct inode *dir_inode, struct dentry *entry, const char *symname); -static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode); +static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode); static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry); @@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = { /* access routines: lookup, readlink, permission */ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) { - struct inode *inode = NULL; - struct CodaFid resfid = { { 0, } }; - int type = 0; - int error = 0; + struct super_block *sb = dir->i_sb; const char *name = entry->d_name.name; size_t length = entry->d_name.len; + struct inode *inode; + int type = 0; if (length > CODA_MAXNAMLEN) { printk(KERN_ERR "name too long: lookup, %s (%*s)\n", @@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc /* control object, create inode on the fly */ if (coda_isroot(dir) && coda_iscontrol(name, length)) { - error = coda_cnode_makectl(&inode, dir->i_sb); + inode = coda_cnode_makectl(sb); type = CODA_NOCACHE; - goto exit; + } else { + struct CodaFid fid = { { 0, } }; + int error = venus_lookup(sb, coda_i2f(dir), name, length, + &type, &fid); + inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error); } - error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length, - &type, &resfid); - if (!error) - error = coda_cnode_make(&inode, &resfid, dir->i_sb); - - if (error && error != -ENOENT) - return ERR_PTR(error); - -exit: - if (inode && (type & CODA_NOCACHE)) + if (!IS_ERR(inode) && (type & CODA_NOCACHE)) coda_flag_inode(inode, C_VATTR | C_PURGE); + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + return d_splice_alias(inode, entry); } @@ -191,7 +188,7 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) +static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, struct nameidata *nd) { int error; const char *name=de->d_name.name; @@ -223,7 +220,7 @@ err_out: return error; } -static int coda_mkdir(struct inode *dir, struct dentry *de, int mode) +static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 871b2771546..5e2e1b3f068 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -58,7 +58,6 @@ static struct inode *coda_alloc_inode(struct super_block *sb) static void coda_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } @@ -205,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); /* make root inode */ - error = coda_cnode_make(&root, &fid, sb); - if ( error || !root ) { - printk("Failure of coda_cnode_make for root: error %d\n", error); - goto error; + root = coda_cnode_make(&fid, sb); + if (IS_ERR(root)) { + error = PTR_ERR(root); + printk("Failure of coda_cnode_make for root: error %d\n", error); + root = NULL; + goto error; } printk("coda_read_super: rootinode is %ld dev %s\n", diff --git a/fs/compat.c b/fs/compat.c index c98787536bb..07880bae28a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { - compat_ino_t ino = stat->ino; - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - int err; + struct compat_stat tmp; - SET_UID(uid, stat->uid); - SET_GID(gid, stat->gid); + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; - if ((u64) stat->size > MAX_NON_LFS || - !old_valid_dev(stat->dev) || - !old_valid_dev(stat->rdev)) + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - - if (clear_user(ubuf, sizeof(*ubuf))) - return -EFAULT; - - err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); - err |= __put_user(ino, &ubuf->st_ino); - err |= __put_user(stat->mode, &ubuf->st_mode); - err |= __put_user(stat->nlink, &ubuf->st_nlink); - err |= __put_user(uid, &ubuf->st_uid); - err |= __put_user(gid, &ubuf->st_gid); - err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); - err |= __put_user(stat->size, &ubuf->st_size); - err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &ubuf->st_blksize); - err |= __put_user(stat->blocks, &ubuf->st_blocks); - return err; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } asmlinkage long compat_sys_newstat(const char __user * filename, @@ -342,16 +336,9 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c */ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) { - struct super_block *sb; struct compat_ustat tmp; struct kstatfs sbuf; - int err; - - sb = user_get_super(new_decode_dev(dev)); - if (!sb) - return -EINVAL; - err = statfs_by_dentry(sb->s_root, &sbuf); - drop_super(sb); + int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; @@ -1288,7 +1275,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, * O_LARGEFILE flag. */ asmlinkage long -compat_sys_open(const char __user *filename, int flags, int mode) +compat_sys_open(const char __user *filename, int flags, umode_t mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } @@ -1298,7 +1285,7 @@ compat_sys_open(const char __user *filename, int flags, int mode) * O_LARGEFILE flag. */ asmlinkage long -compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode) { return do_sys_open(dfd, filename, flags, mode); } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 32200c2ca7f..a26bea10e81 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1507,35 +1507,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, return -ENOIOCTLCMD; } -static void compat_ioctl_error(struct file *filp, unsigned int fd, - unsigned int cmd, unsigned long arg) -{ - char buf[10]; - char *fn = "?"; - char *path; - - /* find the name of the device. */ - path = (char *)__get_free_page(GFP_KERNEL); - if (path) { - fn = d_path(&filp->f_path, path, PAGE_SIZE); - if (IS_ERR(fn)) - fn = "?"; - } - - sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); - if (!isprint(buf[1])) - sprintf(buf, "%02x", buf[1]); - compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " - "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", - current->comm, current->pid, - (int)fd, (unsigned int)cmd, buf, - (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, - (unsigned int)arg, fn); - - if (path) - free_page((unsigned long)path); -} - static int compat_ioctl_check_table(unsigned int xcmd) { int i; @@ -1622,13 +1593,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto found_handler; error = do_ioctl_trans(fd, cmd, arg, filp); - if (error == -ENOIOCTLCMD) { - static int count; - - if (++count <= 50) - compat_ioctl_error(filp, fd, cmd, arg); - error = -EINVAL; - } + if (error == -ENOIOCTLCMD) + error = -ENOTTY; goto out_fput; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 82bda8fdfc1..ede857d20a0 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -63,8 +63,8 @@ extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); -extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *); +extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); extern int configfs_inode_init(void); extern void configfs_inode_exit(void); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 9a37a9b6de3..5ddd7ebd9dc 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -311,8 +311,8 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry) if (item->ci_parent) parent = item->ci_parent->ci_dentry; - else if (configfs_mount && configfs_mount->mnt_sb) - parent = configfs_mount->mnt_sb->s_root; + else if (configfs_mount) + parent = configfs_mount->mnt_root; else return -EFAULT; @@ -1170,7 +1170,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys, } EXPORT_SYMBOL(configfs_undepend_item); -static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 9d8715c45f2..3ee36d41886 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -116,7 +116,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) return error; } -static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -132,7 +132,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_ctime = iattr->ia_ctime; } -struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) +struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { @@ -185,7 +185,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, #endif /* CONFIG_LOCKDEP */ -int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *)) { int error = 0; struct inode * inode = NULL; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 739fb59bcdc..a2ee8f9f5a3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -20,7 +20,6 @@ #include <linux/cramfs_fs.h> #include <linux/slab.h> #include <linux/cramfs_fs_sb.h> -#include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -378,7 +377,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) unsigned long nextoffset; char *name; ino_t ino; - mode_t mode; + umode_t mode; int namelen, error; mutex_lock(&read_mutex); diff --git a/fs/dcache.c b/fs/dcache.c index 89509b5a090..bcbdb33fcc2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -38,6 +38,7 @@ #include <linux/prefetch.h> #include <linux/ratelimit.h> #include "internal.h" +#include "mount.h" /* * Usage: @@ -103,7 +104,7 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(struct dentry *parent, +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned long hash) { hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; @@ -136,6 +137,26 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, } #endif +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + if (scount != tcount) + return 1; + + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); @@ -242,6 +263,7 @@ static void dentry_lru_add(struct dentry *dentry) static void __dentry_lru_del(struct dentry *dentry) { list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; } @@ -275,15 +297,15 @@ static void dentry_lru_prune(struct dentry *dentry) } } -static void dentry_lru_move_tail(struct dentry *dentry) +static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); if (list_empty(&dentry->d_lru)) { - list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_add_tail(&dentry->d_lru, list); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; } else { - list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + list_move_tail(&dentry->d_lru, list); } spin_unlock(&dcache_lru_lock); } @@ -769,14 +791,18 @@ static void shrink_dentry_list(struct list_head *list) } /** - * __shrink_dcache_sb - shrink the dentry LRU on a given superblock - * @sb: superblock to shrink dentry LRU. - * @count: number of entries to prune - * @flags: flags to control the dentry processing + * prune_dcache_sb - shrink the dcache + * @sb: superblock + * @count: number of entries to try to free * - * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. + * Attempt to shrink the superblock dcache LRU by @count entries. This is + * done when we need more memory an called from the superblock shrinker + * function. + * + * This function may fail to free any resources if all the dentries are in + * use. */ -static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) +void prune_dcache_sb(struct super_block *sb, int count) { struct dentry *dentry; LIST_HEAD(referenced); @@ -795,18 +821,13 @@ relock: goto relock; } - /* - * If we are honouring the DCACHE_REFERENCED flag and the - * dentry has this flag set, don't free it. Clear the flag - * and put it back on the LRU. - */ - if (flags & DCACHE_REFERENCED && - dentry->d_flags & DCACHE_REFERENCED) { + if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; list_move(&dentry->d_lru, &referenced); spin_unlock(&dentry->d_lock); } else { list_move_tail(&dentry->d_lru, &tmp); + dentry->d_flags |= DCACHE_SHRINK_LIST; spin_unlock(&dentry->d_lock); if (!--count) break; @@ -821,23 +842,6 @@ relock: } /** - * prune_dcache_sb - shrink the dcache - * @sb: superblock - * @nr_to_scan: number of entries to try to free - * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker - * function. - * - * This function may fail to free any resources if all the dentries are in - * use. - */ -void prune_dcache_sb(struct super_block *sb, int nr_to_scan) -{ - __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); -} - -/** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * @@ -1091,7 +1095,7 @@ EXPORT_SYMBOL(have_submounts); * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, struct list_head *dispose) { struct dentry *this_parent; struct list_head *next; @@ -1113,17 +1117,21 @@ resume: spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache + /* + * move only zero ref count dentries to the dispose list. + * + * Those which are presently on the shrink list, being processed + * by shrink_dentry_list(), shouldn't be moved. Otherwise the + * loop in shrink_dcache_parent() might not make any progress + * and loop forever. */ - if (!dentry->d_count) { - dentry_lru_move_tail(dentry); - found++; - } else { + if (dentry->d_count) { dentry_lru_del(dentry); + } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + dentry_lru_move_list(dentry, dispose); + dentry->d_flags |= DCACHE_SHRINK_LIST; + found++; } - /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find @@ -1180,14 +1188,13 @@ rename_retry: * * Prune the dcache to remove unused children of the parent dentry. */ - void shrink_dcache_parent(struct dentry * parent) { - struct super_block *sb = parent->d_sb; + LIST_HEAD(dispose); int found; - while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, found, 0); + while ((found = select_parent(parent, &dispose)) != 0) + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1460,6 +1467,23 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +struct dentry *d_make_root(struct inode *root_inode) +{ + struct dentry *res = NULL; + + if (root_inode) { + static const struct qstr name = { .name = "/", .len = 1 }; + + res = __d_alloc(root_inode->i_sb, &name); + if (res) + d_instantiate(res, root_inode); + else + iput(root_inode); + } + return res; +} +EXPORT_SYMBOL(d_make_root); + static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; @@ -1471,7 +1495,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } -static struct dentry * d_find_any_alias(struct inode *inode) +/** + * d_find_any_alias - find any alias for a given inode + * @inode: inode to find an alias for + * + * If any aliases exist for the given inode, take and return a + * reference for one of them. If no aliases exist, return %NULL. + */ +struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; @@ -1480,7 +1511,7 @@ static struct dentry * d_find_any_alias(struct inode *inode) spin_unlock(&inode->i_lock); return de; } - +EXPORT_SYMBOL(d_find_any_alias); /** * d_obtain_alias - find or allocate a dentry for a given inode @@ -1706,8 +1737,9 @@ EXPORT_SYMBOL(d_add_ci); * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. */ -struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, - unsigned *seq, struct inode **inode) +struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, + unsigned *seqp, struct inode **inode) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -1737,6 +1769,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + unsigned seq; struct inode *i; const char *tname; int tlen; @@ -1745,7 +1778,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, continue; seqretry: - *seq = read_seqcount_begin(&dentry->d_seq); + seq = read_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) @@ -1760,7 +1793,7 @@ seqretry: * edge of memory when walking. If we could load this * atomically some other way, we could drop this check. */ - if (read_seqcount_retry(&dentry->d_seq, *seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto seqretry; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, @@ -1777,6 +1810,7 @@ seqretry: * order to do anything useful with the returned dentry * anyway. */ + *seqp = seq; *inode = i; return dentry; } @@ -2451,6 +2485,7 @@ static int prepend_path(const struct path *path, { struct dentry *dentry = path->dentry; struct vfsmount *vfsmnt = path->mnt; + struct mount *mnt = real_mount(vfsmnt); bool slash = false; int error = 0; @@ -2460,11 +2495,11 @@ static int prepend_path(const struct path *path, if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { /* Global root? */ - if (vfsmnt->mnt_parent == vfsmnt) { + if (!mnt_has_parent(mnt)) goto global_root; - } - dentry = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + vfsmnt = &mnt->mnt; continue; } parent = dentry->d_parent; @@ -2501,7 +2536,7 @@ global_root: if (!slash) error = prepend(buffer, buflen, "/", 1); if (!error) - error = vfsmnt->mnt_ns ? 1 : 2; + error = real_mount(vfsmnt)->mnt_ns ? 1 : 2; goto out; } @@ -2853,31 +2888,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) return result; } -int path_is_under(struct path *path1, struct path *path2) -{ - struct vfsmount *mnt = path1->mnt; - struct dentry *dentry = path1->dentry; - int res; - - br_read_lock(vfsmount_lock); - if (mnt != path2->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) { - br_read_unlock(vfsmount_lock); - return 0; - } - if (mnt->mnt_parent == path2->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - res = is_subdir(dentry, path2->dentry); - br_read_unlock(vfsmount_lock); - return res; -} -EXPORT_SYMBOL(path_is_under); - void d_genocide(struct dentry *root) { struct dentry *this_parent; @@ -2981,7 +2991,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2999,13 +3009,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3029,7 +3039,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 90f76575c05..ef023eef046 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -15,9 +15,11 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/debugfs.h> +#include <linux/io.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -95,7 +97,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u8(const char *name, mode_t mode, +struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { /* if there are no write bits set, make read only */ @@ -147,7 +149,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u16(const char *name, mode_t mode, +struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { /* if there are no write bits set, make read only */ @@ -199,7 +201,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u32(const char *name, mode_t mode, +struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { /* if there are no write bits set, make read only */ @@ -252,7 +254,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_u64(const char *name, mode_t mode, +struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { /* if there are no write bits set, make read only */ @@ -298,7 +300,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x8(const char *name, mode_t mode, +struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { /* if there are no write bits set, make read only */ @@ -322,7 +324,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x16(const char *name, mode_t mode, +struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { /* if there are no write bits set, make read only */ @@ -346,7 +348,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x32(const char *name, mode_t mode, +struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { /* if there are no write bits set, make read only */ @@ -370,7 +372,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x64(const char *name, mode_t mode, +struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { return debugfs_create_file(name, mode, parent, value, &fops_x64); @@ -401,7 +403,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_size_t(const char *name, mode_t mode, +struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { return debugfs_create_file(name, mode, parent, value, &fops_size_t); @@ -473,7 +475,7 @@ static const struct file_operations fops_bool = { * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_bool(const char *name, mode_t mode, +struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, u32 *value) { return debugfs_create_file(name, mode, parent, value, &fops_bool); @@ -518,10 +520,103 @@ static const struct file_operations fops_blob = { * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ -struct dentry *debugfs_create_blob(const char *name, mode_t mode, +struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file(name, mode, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); + +#ifdef CONFIG_HAS_IOMEM + +/* + * The regset32 stuff is used to print 32-bit registers using the + * seq_file utilities. We offer printing a register set in an already-opened + * sequential file or create a debugfs file that only prints a regset32. + */ + +/** + * debugfs_print_regs32 - use seq_print to describe a set of registers + * @s: the seq_file structure being used to generate output + * @regs: an array if struct debugfs_reg32 structures + * @nregs: the length of the above array + * @base: the base address to be used in reading the registers + * @prefix: a string to be prefixed to every output line + * + * This function outputs a text block describing the current values of + * some 32-bit hardware registers. It is meant to be used within debugfs + * files based on seq_file that need to show registers, intermixed with other + * information. The prefix argument may be used to specify a leading string, + * because some peripherals have several blocks of identical registers, + * for example configuration of dma channels + */ +int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix) +{ + int i, ret = 0; + + for (i = 0; i < nregs; i++, regs++) { + if (prefix) + ret += seq_printf(s, "%s", prefix); + ret += seq_printf(s, "%s = 0x%08x\n", regs->name, + readl(base + regs->offset)); + } + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_print_regs32); + +static int debugfs_show_regset32(struct seq_file *s, void *data) +{ + struct debugfs_regset32 *regset = s->private; + + debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); + return 0; +} + +static int debugfs_open_regset32(struct inode *inode, struct file *file) +{ + return single_open(file, debugfs_show_regset32, inode->i_private); +} + +static const struct file_operations fops_regset32 = { + .open = debugfs_open_regset32, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * debugfs_create_regset32 - create a debugfs file that returns register values + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @regset: a pointer to a struct debugfs_regset32, which contains a pointer + * to an array of register definitions, the array size and the base + * address where the register bank is to be found. + * + * This function creates a file in debugfs with the given name that reports + * the names and values of a set of 32-bit registers. If the @mode variable + * is so set it can be read from. Writing is not supported. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_regset32(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset) +{ + return debugfs_create_file(name, mode, parent, regset, &fops_regset32); +} +EXPORT_SYMBOL_GPL(debugfs_create_regset32); + +#endif /* CONFIG_HAS_IOMEM */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f3a257d7a98..956d5ddddf6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,7 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev, +static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, void *data, const struct file_operations *fops) { @@ -69,7 +69,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d /* SMP-safe */ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev, void *data, + umode_t mode, dev_t dev, void *data, const struct file_operations *fops) { struct inode *inode; @@ -87,7 +87,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry, return error; } -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { int res; @@ -101,14 +101,14 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode, return res; } -static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { mode = (mode & S_IALLUGO) | S_IFLNK; return debugfs_mknod(dir, dentry, mode, 0, data, fops); } -static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, void *data, const struct file_operations *fops) { int res; @@ -146,7 +146,7 @@ static struct file_system_type debug_fs_type = { .kill_sb = kill_litter_super, }; -static int debugfs_create_by_name(const char *name, mode_t mode, +static int debugfs_create_by_name(const char *name, umode_t mode, struct dentry *parent, struct dentry **dentry, void *data, @@ -160,7 +160,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * have around. */ if (!parent) - parent = debugfs_mount->mnt_sb->s_root; + parent = debugfs_mount->mnt_root; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); @@ -214,7 +214,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ -struct dentry *debugfs_create_file(const char *name, mode_t mode, +struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d5d5297efe9..c4e2a58a2e8 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -246,9 +246,9 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) return err; } -static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int devpts_show_options(struct seq_file *seq, struct dentry *root) { - struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb); + struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb); struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) @@ -301,7 +301,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode = new_inode(s); if (!inode) - goto free_fsi; + goto fail; inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) printk(KERN_ERR "devpts: get root dentry failed\n"); iput(inode); -free_fsi: - kfree(s->s_fs_info); fail: return -ENOMEM; } diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6..f4aadd15b61 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> +#include <linux/prefetch.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -172,7 +173,7 @@ void inode_dio_wait(struct inode *inode) if (atomic_read(&inode->i_dio_count)) __inode_dio_wait(inode); } -EXPORT_SYMBOL_GPL(inode_dio_wait); +EXPORT_SYMBOL(inode_dio_wait); /* * inode_dio_done - signal finish of a direct I/O requests @@ -186,7 +187,7 @@ void inode_dio_done(struct inode *inode) if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -EXPORT_SYMBOL_GPL(inode_dio_done); +EXPORT_SYMBOL(inode_dio_done); /* * How many pages are in the queue? @@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; @@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 6cf72fcc0d0..e7e327d43fa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> +#include <linux/dlmconstants.h> #include <net/ipv6.h> #include <net/sock.h> @@ -36,6 +37,7 @@ static struct config_group *space_list; static struct config_group *comm_list; static struct dlm_comm *local_comm; +static uint32_t dlm_comm_count; struct dlm_clusters; struct dlm_cluster; @@ -103,6 +105,8 @@ struct dlm_cluster { unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; unsigned int cl_new_rsb_count; + unsigned int cl_recover_callbacks; + char cl_cluster_name[DLM_LOCKSPACE_LEN]; }; enum { @@ -118,6 +122,8 @@ enum { CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, CLUSTER_ATTR_NEW_RSB_COUNT, + CLUSTER_ATTR_RECOVER_CALLBACKS, + CLUSTER_ATTR_CLUSTER_NAME, }; struct cluster_attribute { @@ -126,6 +132,27 @@ struct cluster_attribute { ssize_t (*store)(struct dlm_cluster *, const char *, size_t); }; +static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) +{ + return sprintf(buf, "%s\n", cl->cl_cluster_name); +} + +static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, + const char *buf, size_t len) +{ + strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); + strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + return len; +} + +static struct cluster_attribute cluster_attr_cluster_name = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "cluster_name", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = cluster_cluster_name_read, + .store = cluster_cluster_name_write, +}; + static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, int *info_field, int check_zero, const char *buf, size_t len) @@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); CLUSTER_ATTR(new_rsb_count, 0); +CLUSTER_ATTR(recover_callbacks, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, + [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr, + [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr, NULL, }; @@ -293,6 +323,7 @@ struct dlm_comms { struct dlm_comm { struct config_item item; + int seq; int nodeid; int local; int addr_count; @@ -309,6 +340,7 @@ struct dlm_node { int nodeid; int weight; int new; + int comm_seq; /* copy of cm->seq when nd->nodeid is set */ }; static struct configfs_group_operations clusters_ops = { @@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; + cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; + memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, + DLM_LOCKSPACE_LEN); space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name) return ERR_PTR(-ENOMEM); config_item_init_type_name(&cm->item, name, &comm_type); + + cm->seq = dlm_comm_count++; + if (!cm->seq) + cm->seq = dlm_comm_count++; + cm->nodeid = -1; cm->local = 0; cm->addr_count = 0; @@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { + uint32_t seq = 0; nd->nodeid = simple_strtol(buf, NULL, 0); + dlm_comm_seq(nd->nodeid, &seq); + nd->comm_seq = seq; return len; } @@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm) } /* caller must free mem */ -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out) +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out) { struct dlm_space *sp; struct dlm_node *nd; - int i = 0, rv = 0, ids_count = 0, new_count = 0; - int *ids, *new; + struct dlm_config_node *nodes, *node; + int rv, count; sp = get_space(lsname); if (!sp) @@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, goto out; } - ids_count = sp->members_count; + count = sp->members_count; - ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); - if (!ids) { + nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS); + if (!nodes) { rv = -ENOMEM; goto out; } + node = nodes; list_for_each_entry(nd, &sp->members, list) { - ids[i++] = nd->nodeid; - if (nd->new) - new_count++; - } - - if (ids_count != i) - printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i); - - if (!new_count) - goto out_ids; + node->nodeid = nd->nodeid; + node->weight = nd->weight; + node->new = nd->new; + node->comm_seq = nd->comm_seq; + node++; - new = kcalloc(new_count, sizeof(int), GFP_NOFS); - if (!new) { - kfree(ids); - rv = -ENOMEM; - goto out; + nd->new = 0; } - i = 0; - list_for_each_entry(nd, &sp->members, list) { - if (nd->new) { - new[i++] = nd->nodeid; - nd->new = 0; - } - } - *new_count_out = new_count; - *new_out = new; - - out_ids: - *ids_count_out = ids_count; - *ids_out = ids; + *count_out = count; + *nodes_out = nodes; + rv = 0; out: mutex_unlock(&sp->members_lock); put_space(sp); return rv; } -int dlm_node_weight(char *lsname, int nodeid) +int dlm_comm_seq(int nodeid, uint32_t *seq) { - struct dlm_space *sp; - struct dlm_node *nd; - int w = -EEXIST; - - sp = get_space(lsname); - if (!sp) - goto out; - - mutex_lock(&sp->members_lock); - list_for_each_entry(nd, &sp->members, list) { - if (nd->nodeid != nodeid) - continue; - w = nd->weight; - break; - } - mutex_unlock(&sp->members_lock); - put_space(sp); - out: - return w; + struct dlm_comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + *seq = cm->seq; + put_comm(cm); + return 0; } int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) @@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 #define DEFAULT_NEW_RSB_COUNT 128 +#define DEFAULT_RECOVER_CALLBACKS 0 +#define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = { .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, - .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT + .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, + .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, + .ci_cluster_name = DEFAULT_CLUSTER_NAME }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 3099d0dd26c..9f5e3663bb0 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,6 +14,13 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +struct dlm_config_node { + int nodeid; + int weight; + int new; + uint32_t comm_seq; +}; + #define DLM_MAX_ADDR_COUNT 3 struct dlm_config_info { @@ -29,15 +36,17 @@ struct dlm_config_info { int ci_timewarn_cs; int ci_waitwarn_us; int ci_new_rsb_count; + int ci_recover_callbacks; + char ci_cluster_name[DLM_LOCKSPACE_LEN]; }; extern struct dlm_config_info dlm_config; int dlm_config_init(void); void dlm_config_exit(void); -int dlm_node_weight(char *lsname, int nodeid); -int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, - int **new_out, int *new_count_out); +int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, + int *count_out); +int dlm_comm_seq(int nodeid, uint32_t *seq); int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); int dlm_our_nodeid(void); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 59779237e2b..3dca2b39e83 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { + struct rb_node *node; struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri; struct dlm_rsb *r; @@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, - res_hashchain) { + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node; + node = rb_next(node)) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); if (!entry--) { dlm_hold_rsb(r); ri->rsb = r; @@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + node = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(node, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; @@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct dlm_ls *ls = seq->private; struct rsbtbl_iter *ri = iter_ptr; - struct list_head *next; + struct rb_node *next; struct dlm_rsb *r, *rp; loff_t n = *pos; unsigned bucket; @@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) spin_lock(&ls->ls_rsbtbl[bucket].lock); rp = ri->rsb; - next = rp->res_hashchain.next; + next = rb_next(&rp->res_hashnode); - if (next != &ls->ls_rsbtbl[bucket].list) { - r = list_entry(next, struct dlm_rsb, res_hashchain); + if (next) { + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; spin_unlock(&ls->ls_rsbtbl[bucket].lock); @@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) } spin_lock(&ls->ls_rsbtbl[bucket].lock); - if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { - r = list_first_entry(&ls->ls_rsbtbl[bucket].list, - struct dlm_rsb, res_hashchain); + if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) { + next = rb_first(&ls->ls_rsbtbl[bucket].keep); + r = rb_entry(next, struct dlm_rsb, res_hashnode); dlm_hold_rsb(r); ri->rsb = r; ri->bucket = bucket; diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 7b84c1dbc82..83641574b01 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls) out_status: error = 0; - dlm_set_recover_status(ls, DLM_RS_DIR); log_debug(ls, "dlm_recover_directory %d entries", count); out_free: kfree(last_name); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index fe2860c0244..3a564d197e9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -103,8 +103,8 @@ struct dlm_dirtable { }; struct dlm_rsbtable { - struct list_head list; - struct list_head toss; + struct rb_root keep; + struct rb_root toss; spinlock_t lock; }; @@ -117,6 +117,10 @@ struct dlm_member { struct list_head list; int nodeid; int weight; + int slot; + int slot_prev; + int comm_seq; + uint32_t generation; }; /* @@ -125,10 +129,8 @@ struct dlm_member { struct dlm_recover { struct list_head list; - int *nodeids; /* nodeids of all members */ - int node_count; - int *new; /* nodeids of new members */ - int new_count; + struct dlm_config_node *nodes; + int nodes_count; uint64_t seq; }; @@ -285,7 +287,10 @@ struct dlm_rsb { unsigned long res_toss_time; uint32_t res_first_lkid; struct list_head res_lookup; /* lkbs waiting on first */ - struct list_head res_hashchain; /* rsbtbl */ + union { + struct list_head res_hashchain; + struct rb_node res_hashnode; /* rsbtbl */ + }; struct list_head res_grantqueue; struct list_head res_convertqueue; struct list_head res_waitqueue; @@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000000 +#define DLM_HEADER_MINOR 0x00000001 + +#define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 @@ -422,10 +429,34 @@ union dlm_packet { struct dlm_rcom rcom; }; +#define DLM_RSF_NEED_SLOTS 0x00000001 + +/* RCOM_STATUS data */ +struct rcom_status { + __le32 rs_flags; + __le32 rs_unused1; + __le64 rs_unused2; +}; + +/* RCOM_STATUS_REPLY data */ struct rcom_config { __le32 rf_lvblen; __le32 rf_lsflags; - __le64 rf_unused; + + /* DLM_HEADER_SLOTS adds: */ + __le32 rf_flags; + __le16 rf_our_slot; + __le16 rf_num_slots; + __le32 rf_generation; + __le32 rf_unused1; + __le64 rf_unused2; +}; + +struct rcom_slot { + __le32 ro_nodeid; + __le16 ro_slot; + __le16 ro_unused1; + __le64 ro_unused2; }; struct rcom_lock { @@ -452,6 +483,7 @@ struct dlm_ls { struct list_head ls_list; /* list of lockspaces */ dlm_lockspace_t *ls_local_handle; uint32_t ls_global_id; /* global unique lockspace ID */ + uint32_t ls_generation; uint32_t ls_exflags; int ls_lvblen; int ls_count; /* refcount of processes in @@ -490,6 +522,11 @@ struct dlm_ls { int ls_total_weight; int *ls_node_array; + int ls_slot; + int ls_num_slots; + int ls_slots_size; + struct dlm_slot *ls_slots; + struct dlm_rsb ls_stub_rsb; /* for returning errors */ struct dlm_lkb ls_stub_lkb; /* for returning errors */ struct dlm_message ls_stub_ms; /* for faking a reply */ @@ -537,6 +574,9 @@ struct dlm_ls { struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ + const struct dlm_lockspace_ops *ls_ops; + void *ls_ops_arg; + int ls_namelen; char ls_name[1]; }; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 83b5e32514e..d47183043c5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -56,6 +56,7 @@ L: receive_xxxx_reply() <- R: send_xxxx_reply() */ #include <linux/types.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include "dlm_internal.h" #include <linux/dlm_device.h> @@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); list_del(&r->res_hashchain); + /* Convert the empty list_head to a NULL rb_node for tree usage: */ + memset(&r->res_hashnode, 0, sizeof(struct rb_node)); ls->ls_new_rsb_count--; spin_unlock(&ls->ls_new_rsb_spin); @@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, memcpy(r->res_name, name, len); mutex_init(&r->res_mutex); - INIT_LIST_HEAD(&r->res_hashchain); INIT_LIST_HEAD(&r->res_lookup); INIT_LIST_HEAD(&r->res_grantqueue); INIT_LIST_HEAD(&r->res_convertqueue); @@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, return 0; } -static int search_rsb_list(struct list_head *head, char *name, int len, +static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) +{ + char maxname[DLM_RESNAME_MAXLEN]; + + memset(maxname, 0, DLM_RESNAME_MAXLEN); + memcpy(maxname, name, nlen); + return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); +} + +static int search_rsb_tree(struct rb_root *tree, char *name, int len, unsigned int flags, struct dlm_rsb **r_ret) { + struct rb_node *node = tree->rb_node; struct dlm_rsb *r; int error = 0; - - list_for_each_entry(r, head, res_hashchain) { - if (len == r->res_length && !memcmp(name, r->res_name, len)) + int rc; + + while (node) { + r = rb_entry(node, struct dlm_rsb, res_hashnode); + rc = rsb_cmp(r, name, len); + if (rc < 0) + node = node->rb_left; + else if (rc > 0) + node = node->rb_right; + else goto found; } *r_ret = NULL; @@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len, return error; } +static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) +{ + struct rb_node **newn = &tree->rb_node; + struct rb_node *parent = NULL; + int rc; + + while (*newn) { + struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb, + res_hashnode); + + parent = *newn; + rc = rsb_cmp(cur, rsb->res_name, rsb->res_length); + if (rc < 0) + newn = &parent->rb_left; + else if (rc > 0) + newn = &parent->rb_right; + else { + log_print("rsb_insert match"); + dlm_dump_rsb(rsb); + dlm_dump_rsb(cur); + return -EEXIST; + } + } + + rb_link_node(&rsb->res_hashnode, parent, newn); + rb_insert_color(&rsb->res_hashnode, tree); + return 0; +} + static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, unsigned int flags, struct dlm_rsb **r_ret) { struct dlm_rsb *r; int error; - error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r); if (!error) { kref_get(&r->res_ref); goto out; } - error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); + error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); if (error) goto out; - list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); + error = rsb_insert(r, &ls->ls_rsbtbl[b].keep); + if (error) + return error; if (dlm_no_directory(ls)) goto out; @@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, nodeid = 0; r->res_nodeid = nodeid; } - list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); - error = 0; + error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep); out_unlock: spin_unlock(&ls->ls_rsbtbl[bucket].lock); out: @@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref) DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); kref_init(&r->res_ref); - list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); + rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); @@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r) r->res_name, r->res_length); } -/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is - found since they are in order of newest to oldest? */ +/* FIXME: make this more efficient */ static int shrink_bucket(struct dlm_ls *ls, int b) { + struct rb_node *n; struct dlm_rsb *r; int count = 0, found; for (;;) { found = 0; spin_lock(&ls->ls_rsbtbl[b].lock); - list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, - res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!time_after_eq(jiffies, r->res_toss_time + dlm_config.ci_toss_secs * HZ)) continue; @@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) } if (kref_put(&r->res_ref, kill_rsb)) { - list_del(&r->res_hashchain); + rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); spin_unlock(&ls->ls_rsbtbl[b].lock); if (is_master(r)) @@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls) static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) { + struct rb_node *n; struct dlm_rsb *r, *r_ret = NULL; spin_lock(&ls->ls_rsbtbl[bucket].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); if (!rsb_flag(r, RSB_LOCKS_PURGED)) continue; hold_rsb(r); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a1d8f1af144..a1ea25face8 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -386,12 +386,15 @@ static void threads_stop(void) dlm_lowcomms_stop(); } -static int new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +static int new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { struct dlm_ls *ls; int i, size, error; int do_unreg = 0; + int namelen = strlen(name); if (namelen > DLM_LOCKSPACE_LEN) return -EINVAL; @@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return -EINVAL; if (!dlm_user_daemon_available()) { - module_put(THIS_MODULE); - return -EUNATCH; + log_print("dlm user daemon not available"); + error = -EUNATCH; + goto out; + } + + if (ops && ops_result) { + if (!dlm_config.ci_recover_callbacks) + *ops_result = -EOPNOTSUPP; + else + *ops_result = 0; + } + + if (dlm_config.ci_recover_callbacks && cluster && + strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { + log_print("dlm cluster name %s mismatch %s", + dlm_config.ci_cluster_name, cluster); + error = -EBADR; + goto out; } error = 0; @@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, ls->ls_flags = 0; ls->ls_scan_time = jiffies; + if (ops && dlm_config.ci_recover_callbacks) { + ls->ls_ops = ops; + ls->ls_ops_arg = ops_arg; + } + if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); @@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_rsbtbl) goto out_lsfree; for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); - INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); + ls->ls_rsbtbl[i].keep.rb_node = NULL; + ls->ls_rsbtbl[i].toss.rb_node = NULL; spin_lock_init(&ls->ls_rsbtbl[i].lock); } @@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, if (!ls->ls_recover_buf) goto out_dirfree; + ls->ls_slot = 0; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; + ls->ls_slots = NULL; + INIT_LIST_HEAD(&ls->ls_recover_list); spin_lock_init(&ls->ls_recover_list_lock); ls->ls_recover_list_count = 0; @@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace, return error; } -int dlm_new_lockspace(const char *name, int namelen, void **lockspace, - uint32_t flags, int lvblen) +int dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, void *ops_arg, + int *ops_result, dlm_lockspace_t **lockspace) { int error = 0; @@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace, if (error) goto out; - error = new_lockspace(name, namelen, lockspace, flags, lvblen); + error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg, + ops_result, lockspace); if (!error) ls_count++; if (error > 0) @@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force) static int release_lockspace(struct dlm_ls *ls, int force) { struct dlm_rsb *rsb; - struct list_head *head; + struct rb_node *n; int i, busy, rv; busy = lockspace_busy(ls, force); @@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force) */ for (i = 0; i < ls->ls_rsbtbl_size; i++) { - head = &ls->ls_rsbtbl[i].list; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - - list_del(&rsb->res_hashchain); + while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].keep); dlm_free_rsb(rsb); } - head = &ls->ls_rsbtbl[i].toss; - while (!list_empty(head)) { - rsb = list_entry(head->next, struct dlm_rsb, - res_hashchain); - list_del(&rsb->res_hashchain); + while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) { + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + rb_erase(n, &ls->ls_rsbtbl[i].toss); dlm_free_rsb(rsb); } } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 990626e7da8..0b3109ee425 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -281,7 +281,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) } else { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; - ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr); + ret6->sin6_addr = in6->sin6_addr; } return 0; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index b12532e553f..862640a36d5 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -19,6 +19,280 @@ #include "config.h" #include "lowcomms.h" +int dlm_slots_version(struct dlm_header *h) +{ + if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS) + return 0; + return 1; +} + +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb) +{ + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + + if (!dlm_slots_version(&rc->rc_header)) + return; + + memb->slot = le16_to_cpu(rf->rf_our_slot); + memb->generation = le32_to_cpu(rf->rf_generation); +} + +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + struct dlm_slot *slot; + struct rcom_slot *ro; + int i; + + ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + /* ls_slots array is sparse, but not rcom_slots */ + + for (i = 0; i < ls->ls_slots_size; i++) { + slot = &ls->ls_slots[i]; + if (!slot->nodeid) + continue; + ro->ro_nodeid = cpu_to_le32(slot->nodeid); + ro->ro_slot = cpu_to_le16(slot->slot); + ro++; + } +} + +#define SLOT_DEBUG_LINE 128 + +static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) +{ + char line[SLOT_DEBUG_LINE]; + int len = SLOT_DEBUG_LINE - 1; + int pos = 0; + int ret, i; + + if (!dlm_config.ci_log_debug) + return; + + memset(line, 0, sizeof(line)); + + if (array) { + for (i = 0; i < array_size; i++) { + if (!array[i].nodeid) + continue; + + ret = snprintf(line + pos, len - pos, " %d:%d", + array[i].slot, array[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } else if (ro0) { + for (i = 0; i < num_slots; i++) { + ret = snprintf(line + pos, len - pos, " %d:%d", + ro0[i].ro_slot, ro0[i].ro_nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + } + + log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); +} + +int dlm_slots_copy_in(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_rcom *rc = ls->ls_recover_buf; + struct rcom_config *rf = (struct rcom_config *)rc->rc_buf; + struct rcom_slot *ro0, *ro; + int our_nodeid = dlm_our_nodeid(); + int i, num_slots; + uint32_t gen; + + if (!dlm_slots_version(&rc->rc_header)) + return -1; + + gen = le32_to_cpu(rf->rf_generation); + if (gen <= ls->ls_generation) { + log_error(ls, "dlm_slots_copy_in gen %u old %u", + gen, ls->ls_generation); + } + ls->ls_generation = gen; + + num_slots = le16_to_cpu(rf->rf_num_slots); + if (!num_slots) + return -1; + + ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config)); + + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid); + ro->ro_slot = le16_to_cpu(ro->ro_slot); + } + + log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); + + list_for_each_entry(memb, &ls->ls_nodes, list) { + for (i = 0, ro = ro0; i < num_slots; i++, ro++) { + if (ro->ro_nodeid != memb->nodeid) + continue; + memb->slot = ro->ro_slot; + memb->slot_prev = memb->slot; + break; + } + + if (memb->nodeid == our_nodeid) { + if (ls->ls_slot && ls->ls_slot != memb->slot) { + log_error(ls, "dlm_slots_copy_in our slot " + "changed %d %d", ls->ls_slot, + memb->slot); + return -1; + } + + if (!ls->ls_slot) + ls->ls_slot = memb->slot; + } + + if (!memb->slot) { + log_error(ls, "dlm_slots_copy_in nodeid %d no slot", + memb->nodeid); + return -1; + } + } + + return 0; +} + +/* for any nodes that do not support slots, we will not have set memb->slot + in wait_status_all(), so memb->slot will remain -1, and we will not + assign slots or set ls_num_slots here */ + +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out) +{ + struct dlm_member *memb; + struct dlm_slot *array; + int our_nodeid = dlm_our_nodeid(); + int array_size, max_slots, i; + int need = 0; + int max = 0; + int num = 0; + uint32_t gen = 0; + + /* our own memb struct will have slot -1 gen 0 */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->nodeid == our_nodeid) { + memb->slot = ls->ls_slot; + memb->generation = ls->ls_generation; + break; + } + } + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->generation > gen) + gen = memb->generation; + + /* node doesn't support slots */ + + if (memb->slot == -1) + return -1; + + /* node needs a slot assigned */ + + if (!memb->slot) + need++; + + /* node has a slot assigned */ + + num++; + + if (!max || max < memb->slot) + max = memb->slot; + + /* sanity check, once slot is assigned it shouldn't change */ + + if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) { + log_error(ls, "nodeid %d slot changed %d %d", + memb->nodeid, memb->slot_prev, memb->slot); + return -1; + } + memb->slot_prev = memb->slot; + } + + array_size = max + need; + + array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + if (!array) + return -ENOMEM; + + num = 0; + + /* fill in slots (offsets) that are used */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (!memb->slot) + continue; + + if (memb->slot > array_size) { + log_error(ls, "invalid slot number %d", memb->slot); + kfree(array); + return -1; + } + + array[memb->slot - 1].nodeid = memb->nodeid; + array[memb->slot - 1].slot = memb->slot; + num++; + } + + /* assign new slots from unused offsets */ + + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (memb->slot) + continue; + + for (i = 0; i < array_size; i++) { + if (array[i].nodeid) + continue; + + memb->slot = i + 1; + memb->slot_prev = memb->slot; + array[i].nodeid = memb->nodeid; + array[i].slot = memb->slot; + num++; + + if (!ls->ls_slot && memb->nodeid == our_nodeid) + ls->ls_slot = memb->slot; + break; + } + + if (!memb->slot) { + log_error(ls, "no free slot found"); + kfree(array); + return -1; + } + } + + gen++; + + log_debug_slots(ls, gen, num, NULL, array, array_size); + + max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - + sizeof(struct rcom_config)) / sizeof(struct rcom_slot); + + if (num > max_slots) { + log_error(ls, "num_slots %d exceeds max_slots %d", + num, max_slots); + kfree(array); + return -1; + } + + *gen_out = gen; + *slots_out = array; + *slots_size = array_size; + *num_slots = num; + return 0; +} + static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) { struct dlm_member *memb = NULL; @@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) } } -static int dlm_add_member(struct dlm_ls *ls, int nodeid) +static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) { struct dlm_member *memb; - int w, error; + int error; memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); if (!memb) return -ENOMEM; - w = dlm_node_weight(ls->ls_name, nodeid); - if (w < 0) { - kfree(memb); - return w; - } - - error = dlm_lowcomms_connect_node(nodeid); + error = dlm_lowcomms_connect_node(node->nodeid); if (error < 0) { kfree(memb); return error; } - memb->nodeid = nodeid; - memb->weight = w; + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; } -static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) -{ - list_move(&memb->list, &ls->ls_nodes_gone); - ls->ls_num_nodes--; -} - -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static struct dlm_member *find_memb(struct list_head *head, int nodeid) { struct dlm_member *memb; - list_for_each_entry(memb, &ls->ls_nodes, list) { + list_for_each_entry(memb, head, list) { if (memb->nodeid == nodeid) - return 1; + return memb; } + return NULL; +} + +int dlm_is_member(struct dlm_ls *ls, int nodeid) +{ + if (find_memb(&ls->ls_nodes, nodeid)) + return 1; return 0; } int dlm_is_removed(struct dlm_ls *ls, int nodeid) { - struct dlm_member *memb; - - list_for_each_entry(memb, &ls->ls_nodes_gone, list) { - if (memb->nodeid == nodeid) - return 1; - } + if (find_memb(&ls->ls_nodes_gone, nodeid)) + return 1; return 0; } @@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls) error = dlm_recovery_stopped(ls); if (error) break; - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; } @@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls) return error; } +static void dlm_lsop_recover_prep(struct dlm_ls *ls) +{ + if (!ls->ls_ops || !ls->ls_ops->recover_prep) + return; + ls->ls_ops->recover_prep(ls->ls_ops_arg); +} + +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +{ + struct dlm_slot slot; + uint32_t seq; + int error; + + if (!ls->ls_ops || !ls->ls_ops->recover_slot) + return; + + /* if there is no comms connection with this node + or the present comms connection is newer + than the one when this member was added, then + we consider the node to have failed (versus + being removed due to dlm_release_lockspace) */ + + error = dlm_comm_seq(memb->nodeid, &seq); + + if (!error && seq == memb->comm_seq) + return; + + slot.nodeid = memb->nodeid; + slot.slot = memb->slot; + + ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot); +} + +void dlm_lsop_recover_done(struct dlm_ls *ls) +{ + struct dlm_member *memb; + struct dlm_slot *slots; + int i, num; + + if (!ls->ls_ops || !ls->ls_ops->recover_done) + return; + + num = ls->ls_num_nodes; + + slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + if (!slots) + return; + + i = 0; + list_for_each_entry(memb, &ls->ls_nodes, list) { + if (i == num) { + log_error(ls, "dlm_lsop_recover_done bad num %d", num); + goto out; + } + slots[i].nodeid = memb->nodeid; + slots[i].slot = memb->slot; + i++; + } + + ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num, + ls->ls_slot, ls->ls_generation); + out: + kfree(slots); +} + +static struct dlm_config_node *find_config_node(struct dlm_recover *rv, + int nodeid) +{ + int i; + + for (i = 0; i < rv->nodes_count; i++) { + if (rv->nodes[i].nodeid == nodeid) + return &rv->nodes[i]; + } + return NULL; +} + int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) { struct dlm_member *memb, *safe; - int i, error, found, pos = 0, neg = 0, low = -1; + struct dlm_config_node *node; + int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to count as a negative change so the "neg" recovery steps will happen */ @@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) /* move departed members from ls_nodes to ls_nodes_gone */ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { - found = 0; - for (i = 0; i < rv->node_count; i++) { - if (memb->nodeid == rv->nodeids[i]) { - found = 1; - break; - } - } + node = find_config_node(rv, memb->nodeid); + if (node && !node->new) + continue; - if (!found) { - neg++; - dlm_remove_member(ls, memb); + if (!node) { log_debug(ls, "remove member %d", memb->nodeid); + } else { + /* removed and re-added */ + log_debug(ls, "remove member %d comm_seq %u %u", + memb->nodeid, memb->comm_seq, node->comm_seq); } - } - - /* Add an entry to ls_nodes_gone for members that were removed and - then added again, so that previous state for these nodes will be - cleared during recovery. */ - - for (i = 0; i < rv->new_count; i++) { - if (!dlm_is_member(ls, rv->new[i])) - continue; - log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); - if (!memb) - return -ENOMEM; - memb->nodeid = rv->new[i]; - list_add_tail(&memb->list, &ls->ls_nodes_gone); neg++; + list_move(&memb->list, &ls->ls_nodes_gone); + ls->ls_num_nodes--; + dlm_lsop_recover_slot(ls, memb); } /* add new members to ls_nodes */ - for (i = 0; i < rv->node_count; i++) { - if (dlm_is_member(ls, rv->nodeids[i])) + for (i = 0; i < rv->nodes_count; i++) { + node = &rv->nodes[i]; + if (dlm_is_member(ls, node->nodeid)) continue; - dlm_add_member(ls, rv->nodeids[i]); - pos++; - log_debug(ls, "add member %d", rv->nodeids[i]); + dlm_add_member(ls, node); + log_debug(ls, "add member %d", node->nodeid); } list_for_each_entry(memb, &ls->ls_nodes, list) { @@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_low_nodeid = low; make_member_array(ls); - dlm_set_recover_status(ls, DLM_RS_NODES); *neg_out = neg; error = ping_members(ls); @@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) ls->ls_members_result = error; complete(&ls->ls_members_done); } - if (error) - goto out; - error = dlm_recover_members_wait(ls); - out: - log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error); + log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls) */ dlm_recoverd_suspend(ls); + + spin_lock(&ls->ls_recover_lock); + kfree(ls->ls_slots); + ls->ls_slots = NULL; + ls->ls_num_slots = 0; + ls->ls_slots_size = 0; ls->ls_recover_status = 0; + spin_unlock(&ls->ls_recover_lock); + dlm_recoverd_resume(ls); if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; + + dlm_lsop_recover_prep(ls); return 0; } int dlm_ls_start(struct dlm_ls *ls) { struct dlm_recover *rv = NULL, *rv_old; - int *ids = NULL, *new = NULL; - int error, ids_count = 0, new_count = 0; + struct dlm_config_node *nodes; + int error, count; rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); if (!rv) return -ENOMEM; - error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, - &new, &new_count); + error = dlm_config_nodes(ls->ls_name, &nodes, &count); if (error < 0) goto fail; @@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls) goto fail; } - rv->nodeids = ids; - rv->node_count = ids_count; - rv->new = new; - rv->new_count = new_count; + rv->nodes = nodes; + rv->nodes_count = count; rv->seq = ++ls->ls_recover_seq; rv_old = ls->ls_recover_args; ls->ls_recover_args = rv; @@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls) if (rv_old) { log_error(ls, "unused recovery %llx %d", - (unsigned long long)rv_old->seq, rv_old->node_count); - kfree(rv_old->nodeids); - kfree(rv_old->new); + (unsigned long long)rv_old->seq, rv_old->nodes_count); + kfree(rv_old->nodes); kfree(rv_old); } @@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls) fail: kfree(rv); - kfree(ids); - kfree(new); + kfree(nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 7a26fca1e0b..3deb70661c6 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); +int dlm_slots_version(struct dlm_header *h); +void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, + struct dlm_member *memb); +void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_slots_copy_in(struct dlm_ls *ls); +int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, + struct dlm_slot **slots_out, uint32_t *gen_out); +void dlm_lsop_recover_done(struct dlm_ls *ls); #endif /* __MEMBER_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f10a50f24e8..ac5c616c969 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -23,6 +23,7 @@ #include "memory.h" #include "lock.h" #include "util.h" +#include "member.h" static int rcom_response(struct dlm_ls *ls) @@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, dlm_lowcomms_commit_buffer(mh); } +static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, + uint32_t flags) +{ + rs->rs_flags = cpu_to_le32(flags); +} + /* When replying to a status request, a node also sends back its configuration values. The requesting node then checks that the remote node is configured the same way as itself. */ -static void make_config(struct dlm_ls *ls, struct rcom_config *rf) +static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf, + uint32_t num_slots) { rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); + + rf->rf_our_slot = cpu_to_le16(ls->ls_slot); + rf->rf_num_slots = cpu_to_le16(num_slots); + rf->rf_generation = cpu_to_le32(ls->ls_generation); } -static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; - size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { log_error(ls, "version mismatch: %x nodeid %d: %x", @@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) return -EPROTO; } - if (rc->rc_header.h_length < conf_size) { - log_error(ls, "config too short: %d nodeid %d", - rc->rc_header.h_length, nodeid); - return -EPROTO; - } - if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", @@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_status(struct dlm_ls *ls, int nodeid) +/* + * low nodeid gathers one slot value at a time from each node. + * it sets need_slots=0, and saves rf_our_slot returned from each + * rcom_config. + * + * other nodes gather all slot values at once from the low nodeid. + * they set need_slots=1, and ignore the rf_our_slot returned from each + * rcom_config. they use the rf_num_slots returned from the low + * node's rcom_config. + */ + +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) goto out; } - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &mh); if (error) goto out; + set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); + allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); @@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid) /* we pretend the remote lockspace exists with 0 status */ log_debug(ls, "remote node %d not ready", nodeid); rc->rc_result = 0; - } else - error = check_config(ls, rc, nodeid); + error = 0; + } else { + error = check_rcom_config(ls, rc, nodeid); + } + /* the caller looks at rc_result for the remote recovery status */ out: return error; @@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct dlm_mhandle *mh; - int error, nodeid = rc_in->rc_header.h_nodeid; + struct rcom_status *rs; + uint32_t status; + int nodeid = rc_in->rc_header.h_nodeid; + int len = sizeof(struct rcom_config); + int num_slots = 0; + int error; + + if (!dlm_slots_version(&rc_in->rc_header)) { + status = dlm_recover_status(ls); + goto do_create; + } + + rs = (struct rcom_status *)rc_in->rc_buf; + if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) { + status = dlm_recover_status(ls); + goto do_create; + } + + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + num_slots = ls->ls_num_slots; + spin_unlock(&ls->ls_recover_lock); + len += num_slots * sizeof(struct rcom_slot); + + do_create: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - sizeof(struct rcom_config), &rc, &mh); + len, &rc, &mh); if (error) return; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - rc->rc_result = dlm_recover_status(ls); - make_config(ls, (struct rcom_config *) rc->rc_buf); + rc->rc_result = status; + + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots); + + if (!num_slots) + goto do_send; + + spin_lock(&ls->ls_recover_lock); + if (ls->ls_num_slots != num_slots) { + spin_unlock(&ls->ls_recover_lock); + log_debug(ls, "receive_rcom_status num_slots %d to %d", + num_slots, ls->ls_num_slots); + rc->rc_result = 0; + set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0); + goto do_send; + } + + dlm_slots_copy_out(ls, rc); + spin_unlock(&ls->ls_recover_lock); + do_send: send_rcom(ls, mh, rc); } diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index b09abd29ba3..206723ab744 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -14,7 +14,7 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 14638235f7b..34d5adf1fce 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls) return status; } +static void _set_recover_status(struct dlm_ls *ls, uint32_t status) +{ + ls->ls_recover_status |= status; +} + void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) { spin_lock(&ls->ls_recover_lock); - ls->ls_recover_status |= status; + _set_recover_status(ls, status); spin_unlock(&ls->ls_recover_lock); } -static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, + int save_slots) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, memb->nodeid); + error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) goto out; + if (save_slots) + dlm_slot_save(ls, rc, memb); + if (rc->rc_result & wait_status) break; if (delay < 1000) @@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) return error; } -static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) +static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, + uint32_t status_flags) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) goto out; } - error = dlm_rcom_status(ls, nodeid); + error = dlm_rcom_status(ls, nodeid, status_flags); if (error) break; @@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status) int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status); + error = wait_status_all(ls, status, 0); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all); + error = wait_status_low(ls, status_all, 0); return error; } int dlm_recover_members_wait(struct dlm_ls *ls) { - return wait_status(ls, DLM_RS_NODES); + struct dlm_member *memb; + struct dlm_slot *slots; + int num_slots, slots_size; + int error, rv; + uint32_t gen; + + list_for_each_entry(memb, &ls->ls_nodes, list) { + memb->slot = -1; + memb->generation = 0; + } + + if (ls->ls_low_nodeid == dlm_our_nodeid()) { + error = wait_status_all(ls, DLM_RS_NODES, 1); + if (error) + goto out; + + /* slots array is sparse, slots_size may be > num_slots */ + + rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen); + if (!rv) { + spin_lock(&ls->ls_recover_lock); + _set_recover_status(ls, DLM_RS_NODES_ALL); + ls->ls_num_slots = num_slots; + ls->ls_slots_size = slots_size; + ls->ls_slots = slots; + ls->ls_generation = gen; + spin_unlock(&ls->ls_recover_lock); + } else { + dlm_set_recover_status(ls, DLM_RS_NODES_ALL); + } + } else { + error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + if (error) + goto out; + + dlm_slots_copy_in(ls); + } + out: + return error; } int dlm_recover_directory_wait(struct dlm_ls *ls) @@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls) out: if (error) recover_list_clear(ls); - else - dlm_set_recover_status(ls, DLM_RS_LOCKS); return error; } @@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) int dlm_create_root_list(struct dlm_ls *ls) { + struct rb_node *n; struct dlm_rsb *r; int i, error = 0; @@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls) for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls) continue; } - list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } @@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls) void dlm_clear_toss_list(struct dlm_ls *ls) { - struct dlm_rsb *r, *safe; + struct rb_node *n, *next; + struct dlm_rsb *rsb; int i; for (i = 0; i < ls->ls_rsbtbl_size; i++) { spin_lock(&ls->ls_rsbtbl[i].lock); - list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, - res_hashchain) { - if (dlm_no_directory(ls) || !is_master(r)) { - list_del(&r->res_hashchain); - dlm_free_rsb(r); + for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) { + next = rb_next(n);; + rsb = rb_entry(n, struct dlm_rsb, res_hashnode); + if (dlm_no_directory(ls) || !is_master(rsb)) { + rb_erase(n, &ls->ls_rsbtbl[i].toss); + dlm_free_rsb(rsb); } } spin_unlock(&ls->ls_rsbtbl[i].lock); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 774da3cf92c..3780caf7ae0 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "recover %llx", (unsigned long long)rv->seq); + log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. - * Also waits for all nodes to complete dlm_recover_members. */ error = dlm_recover_members(ls, rv, &neg); if (error) { - log_debug(ls, "recover_members failed %d", error); + log_debug(ls, "dlm_recover_members error %d", error); goto fail; } + + dlm_set_recover_status(ls, DLM_RS_NODES); + + error = dlm_recover_members_wait(ls); + if (error) { + log_debug(ls, "dlm_recover_members_wait error %d", error); + goto fail; + } + start = jiffies; /* @@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_debug(ls, "recover_directory failed %d", error); + log_debug(ls, "dlm_recover_directory error %d", error); goto fail; } - /* - * Wait for all nodes to complete directory rebuild. - */ + dlm_set_recover_status(ls, DLM_RS_DIR); error = dlm_recover_directory_wait(ls); if (error) { - log_debug(ls, "recover_directory_wait failed %d", error); + log_debug(ls, "dlm_recover_directory_wait error %d", error); goto fail; } @@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_debug(ls, "recover_masters failed %d", error); + log_debug(ls, "dlm_recover_masters error %d", error); goto fail; } @@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_debug(ls, "recover_locks failed %d", error); + log_debug(ls, "dlm_recover_locks error %d", error); goto fail; } + dlm_set_recover_status(ls, DLM_RS_LOCKS); + error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } @@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "recover_locks_wait failed %d", error); + log_debug(ls, "dlm_recover_locks_wait error %d", error); goto fail; } } @@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_purge_requestqueue(ls); dlm_set_recover_status(ls, DLM_RS_DONE); + error = dlm_recover_done_wait(ls); if (error) { - log_debug(ls, "recover_done_wait failed %d", error); + log_debug(ls, "dlm_recover_done_wait error %d", error); goto fail; } @@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_debug(ls, "enable_locking failed %d", error); + log_debug(ls, "enable_locking error %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_debug(ls, "process_requestqueue failed %d", error); + log_debug(ls, "dlm_process_requestqueue error %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_debug(ls, "recover_waiters_post failed %d", error); + log_debug(ls, "dlm_recover_waiters_post error %d", error); goto fail; } dlm_grant_after_purge(ls); - log_debug(ls, "recover %llx done: %u ms", - (unsigned long long)rv->seq, + log_debug(ls, "dlm_recover %llx generation %u done: %u ms", + (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); + dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_debug(ls, "recover %llx error %d", + log_debug(ls, "dlm_recover %llx error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; @@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls) if (rv) { ls_recover(ls, rv); - kfree(rv->nodeids); - kfree(rv->new); + kfree(rv->nodes); kfree(rv); } } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d8ea6075640..eb4ed9ba309 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, strlen(params->name), - &lockspace, params->flags, DLM_USER_LVB_LEN); + error = dlm_new_lockspace(params->name, NULL, params->flags, + DLM_USER_LVB_LEN, NULL, NULL, NULL, + &lockspace); if (error) return error; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2a834255c75..ea993128155 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, (unsigned long long)(extent_base + extent_offset), rc); goto out; } - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypting extent " - "with iv:\n"); - ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " - "encryption:\n"); - ecryptfs_dump_hex((char *) - (page_address(page) - + (extent_offset * crypt_stat->extent_size)), - 8); - } rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, page, (extent_offset * crypt_stat->extent_size), @@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, goto out; } rc = 0; - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " - "rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " - "encryption:\n"); - ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); - } out: return rc; } @@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page, (unsigned long long)(extent_base + extent_offset), rc); goto out; } - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypting extent " - "with iv:\n"); - ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " - "decryption:\n"); - ecryptfs_dump_hex((char *) - (page_address(enc_extent_page) - + (extent_offset * crypt_stat->extent_size)), - 8); - } rc = ecryptfs_decrypt_page_offset(crypt_stat, page, (extent_offset * crypt_stat->extent_size), @@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page, goto out; } rc = 0; - if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " - "rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " - "decryption:\n"); - ecryptfs_dump_hex((char *)(page_address(page) - + (extent_offset - * crypt_stat->extent_size)), 8); - } out: return rc; } @@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, */ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) { - int rc = 0; - char *page_virt = NULL; + int rc; + char *page_virt; struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; @@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry, ECRYPTFS_VALIDATE_HEADER_SIZE); if (rc) { + /* metadata is not in the file header, so try xattrs */ memset(page_virt, 0, PAGE_CACHE_SIZE); rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file header region or xattr region\n"); + "file header region or xattr region, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; goto out; } @@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file xattr region either\n"); + "file xattr region either, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; } if (crypt_stat->mount_crypt_stat->flags @@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) "crypto metadata only in the extended attribute " "region, but eCryptfs was mounted without " "xattr support enabled. eCryptfs will not treat " - "this like an encrypted file.\n"); + "this like an encrypted file, inode %lu\n", + ecryptfs_inode->i_ino); rc = -EINVAL; } } @@ -2026,6 +1990,17 @@ out: return; } +static size_t ecryptfs_max_decoded_size(size_t encoded_size) +{ + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + return ((encoded_size + 1) * 3) / 4; +} + /** * ecryptfs_decode_from_filename * @dst: If NULL, this function only sets @dst_size and returns. If @@ -2044,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, size_t dst_byte_offset = 0; if (dst == NULL) { - /* Not exact; conservatively long. Every block of 4 - * encoded characters decodes into a block of 3 - * decoded characters. This segment of code provides - * the caller with the maximum amount of allocated - * space that @dst will need to point to in a - * subsequent call. */ - (*dst_size) = (((src_size + 1) * 3) / 4); + (*dst_size) = ecryptfs_max_decoded_size(src_size); goto out; } while (src_byte_offset < src_size) { @@ -2275,3 +2244,52 @@ out_free: out: return rc; } + +#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143 + +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct blkcipher_desc desc; + struct mutex *tfm_mutex; + size_t cipher_blocksize; + int rc; + + if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + (*namelen) = lower_namelen; + return 0; + } + + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + (*namelen) = 0; + return rc; + } + + mutex_lock(tfm_mutex); + cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); + mutex_unlock(tfm_mutex); + + /* Return an exact amount for the common cases */ + if (lower_namelen == NAME_MAX + && (cipher_blocksize == 8 || cipher_blocksize == 16)) { + (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16; + return 0; + } + + /* Return a safe estimate for the uncommon cases */ + (*namelen) = lower_namelen; + (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + /* Since this is the max decoded size, subtract 1 "decoded block" len */ + (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3; + (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE; + (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES; + /* Worst case is that the filename is padded nearly a full block size */ + (*namelen) -= cipher_blocksize - 1; + + if ((*namelen) < 0) + (*namelen) = 0; + + return 0; +} diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a9f29b12fbf..867b64c5d84 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -151,12 +151,21 @@ ecryptfs_get_key_payload_data(struct key *key) * dentry name */ #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as * metadata */ +#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */ +#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to + * ecryptfs_parse_packet_length() and + * ecryptfs_write_packet_length() + */ /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define MD5_DIGEST_SIZE 16 #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) +#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." @@ -696,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, size_t *packet_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *data, size_t max_packet_size); +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 32f90a3ae63..ab35b113003 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -144,24 +144,6 @@ static int ecryptfs_interpose(struct dentry *lower_dentry, } /** - * ecryptfs_create_underlying_file - * @lower_dir_inode: inode of the parent in the lower fs of the new file - * @dentry: New file's dentry - * @mode: The mode of the new file - * - * Creates the file in the lower file system. - * - * Returns zero on success; non-zero on error condition - */ -static int -ecryptfs_create_underlying_file(struct inode *lower_dir_inode, - struct dentry *dentry, int mode) -{ - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - return vfs_create(lower_dir_inode, lower_dentry, mode, NULL); -} - -/** * ecryptfs_do_create * @directory_inode: inode of the new file's dentry's parent in ecryptfs * @ecryptfs_dentry: New file's dentry in ecryptfs @@ -176,7 +158,7 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, */ static struct inode * ecryptfs_do_create(struct inode *directory_inode, - struct dentry *ecryptfs_dentry, int mode) + struct dentry *ecryptfs_dentry, umode_t mode) { int rc; struct dentry *lower_dentry; @@ -191,8 +173,7 @@ ecryptfs_do_create(struct inode *directory_inode, inode = ERR_CAST(lower_dir_dentry); goto out; } - rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, - ecryptfs_dentry, mode); + rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, NULL); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -267,7 +248,7 @@ out: */ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { struct inode *ecryptfs_inode; int rc; @@ -559,7 +540,7 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; @@ -607,7 +588,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int rc; struct dentry *lower_dentry; @@ -841,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, size_t num_zeros = (PAGE_CACHE_SIZE - (ia->ia_size & ~PAGE_CACHE_MASK)); - - /* - * XXX(truncate) this should really happen at the begginning - * of ->setattr. But the code is too messy to that as part - * of a larger patch. ecryptfs is also totally missing out - * on the inode_change_ok check at the beginning of - * ->setattr while would include this. - */ - rc = inode_newsize_ok(inode, ia->ia_size); - if (rc) - goto out; - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { truncate_setsize(inode, ia->ia_size); lower_ia->ia_size = ia->ia_size; @@ -902,6 +871,28 @@ out: return rc; } +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ + struct ecryptfs_crypt_stat *crypt_stat; + loff_t lower_oldsize, lower_newsize; + + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + lower_oldsize = upper_size_to_lower_size(crypt_stat, + i_size_read(inode)); + lower_newsize = upper_size_to_lower_size(crypt_stat, offset); + if (lower_newsize > lower_oldsize) { + /* + * The eCryptfs inode and the new *lower* size are mixed here + * because we may not have the lower i_mutex held and/or it may + * not be appropriate to call inode_newsize_ok() with inodes + * from other filesystems. + */ + return inode_newsize_ok(inode, lower_newsize); + } + + return 0; +} + /** * ecryptfs_truncate * @dentry: The ecryptfs layer dentry @@ -918,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct iattr lower_ia = { .ia_valid = 0 }; int rc; + rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + if (rc) + return rc; + rc = truncate_upper(dentry, &ia, &lower_ia); if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); @@ -997,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } } mutex_unlock(&crypt_stat->cs_mutex); + + rc = inode_change_ok(inode, ia); + if (rc) + goto out; + if (ia->ia_valid & ATTR_SIZE) { + rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); + if (rc) + goto out; + } + if (S_ISREG(inode->i_mode)) { rc = filemap_write_and_wait(inode->i_mapping); if (rc) @@ -1080,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ac1ad48c237..2333203a120 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, (*size) += ((unsigned char)(data[1]) + 192); (*length_size) = 2; } else if (data[0] == 255) { - /* Five-byte length; we're not supposed to see this */ + /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ ecryptfs_printk(KERN_ERR, "Five-byte packet length not " "supported\n"); rc = -EINVAL; @@ -126,7 +126,7 @@ out: /** * ecryptfs_write_packet_length * @dest: The byte array target into which to write the length. Must - * have at least 5 bytes allocated. + * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated. * @size: The length to write. * @packet_size_length: The number of bytes used to encode the packet * length is written to this address. @@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size, dest[1] = ((size - 192) % 256); (*packet_size_length) = 2; } else { + /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Unsupported packet size: [%zd]\n", size); @@ -678,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * Octets N3-N4: Block-aligned encrypted filename * - Consists of a minimum number of random characters, a \0 * separator, and then the filename */ - s->max_packet_size = (1 /* Tag 70 identifier */ - + 3 /* Max Tag 70 packet size */ - + ECRYPTFS_SIG_SIZE /* FNEK sig */ - + 1 /* Cipher identifier */ + s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + s->block_aligned_filename_size); if (dest == NULL) { (*packet_size) = s->max_packet_size; @@ -933,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, - (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + ECRYPTFS_TAG_70_MIN_METADATA_SIZE); rc = -EINVAL; goto out; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 940a82e63dc..3a06f4043df 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -218,6 +218,29 @@ out_unlock: return rc; } +/* + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not include a message + */ +#define PKT_TYPE_SIZE 1 +#define PKT_CTR_SIZE 4 +#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE) +#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \ + + ECRYPTFS_MIN_PKT_LEN_SIZE) +/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */ +#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \ + + ECRYPTFS_MAX_PKT_LEN_SIZE \ + + sizeof(struct ecryptfs_message) \ + + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) +#define PKT_TYPE_OFFSET 0 +#define PKT_CTR_OFFSET PKT_TYPE_SIZE +#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE) + /** * ecryptfs_miscdev_read - format and send message from queue * @file: fs/ecryptfs/euid miscdevfs handle (ignored) @@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, struct ecryptfs_daemon *daemon; struct ecryptfs_msg_ctx *msg_ctx; size_t packet_length_size; - char packet_length[3]; + char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE]; size_t i; size_t total_length; uid_t euid = current_euid(); @@ -305,15 +328,8 @@ check_list: packet_length_size = 0; msg_ctx->msg_size = 0; } - /* miscdevfs packet format: - * Octet 0: Type - * Octets 1-4: network byte order msg_ctx->counter - * Octets 5-N0: Size of struct ecryptfs_message to follow - * Octets N0-N1: struct ecryptfs_message (including data) - * - * Octets 5-N1 not written if the packet type does not - * include a message */ - total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); + total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size + + msg_ctx->msg_size); if (count < total_length) { rc = 0; printk(KERN_WARNING "%s: Only given user buffer of " @@ -324,9 +340,10 @@ check_list: rc = -EFAULT; if (put_user(msg_ctx->type, buf)) goto out_unlock_msg_ctx; - if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) + if (put_user(cpu_to_be32(msg_ctx->counter), + (__be32 __user *)(&buf[PKT_CTR_OFFSET]))) goto out_unlock_msg_ctx; - i = 5; + i = PKT_TYPE_SIZE + PKT_CTR_SIZE; if (msg_ctx->msg) { if (copy_to_user(&buf[i], packet_length, packet_length_size)) goto out_unlock_msg_ctx; @@ -391,12 +408,6 @@ out: * @count: Amount of data in @buf * @ppos: Pointer to offset in file (ignored) * - * miscdevfs packet format: - * Octet 0: Type - * Octets 1-4: network byte order msg_ctx->counter (0's for non-response) - * Octets 5-N0: Size of struct ecryptfs_message to follow - * Octets N0-N1: struct ecryptfs_message (including data) - * * Returns the number of bytes read from @buf */ static ssize_t @@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, { __be32 counter_nbo; u32 seq; - size_t packet_size, packet_size_length, i; - ssize_t sz = 0; + size_t packet_size, packet_size_length; char *data; uid_t euid = current_euid(); - int rc; + unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE]; + ssize_t rc; - if (count == 0) - goto out; + if (count == 0) { + return 0; + } else if (count == MIN_NON_MSG_PKT_SIZE) { + /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ + goto memdup; + } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { + printk(KERN_WARNING "%s: Acceptable packet size range is " + "[%d-%zu], but amount of data written is [%zu].", + __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); + return -EINVAL; + } + + if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET], + sizeof(packet_size_peek))) { + printk(KERN_WARNING "%s: Error while inspecting packet size\n", + __func__); + return -EFAULT; + } + rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%zd]\n", __func__, rc); + return rc; + } + + if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size) + != count) { + printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, + packet_size); + return -EINVAL; + } + +memdup: data = memdup_user(buf, count); if (IS_ERR(data)) { printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", __func__, PTR_ERR(data)); - goto out; + return PTR_ERR(data); } - sz = count; - i = 0; - switch (data[i++]) { + switch (data[PKT_TYPE_OFFSET]) { case ECRYPTFS_MSG_RESPONSE: - if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { + if (count < (MIN_MSG_PKT_SIZE + + sizeof(struct ecryptfs_message))) { printk(KERN_WARNING "%s: Minimum acceptable packet " "size is [%zd], but amount of data written is " "only [%zd]. Discarding response packet.\n", __func__, - (1 + 4 + 1 + sizeof(struct ecryptfs_message)), - count); + (MIN_MSG_PKT_SIZE + + sizeof(struct ecryptfs_message)), count); + rc = -EINVAL; goto out_free; } - memcpy(&counter_nbo, &data[i], 4); + memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE); seq = be32_to_cpu(counter_nbo); - i += 4; - rc = ecryptfs_parse_packet_length(&data[i], &packet_size, - &packet_size_length); + rc = ecryptfs_miscdev_response( + &data[PKT_LEN_OFFSET + packet_size_length], + packet_size, euid, current_user_ns(), + task_pid(current), seq); if (rc) { - printk(KERN_WARNING "%s: Error parsing packet length; " - "rc = [%d]\n", __func__, rc); - goto out_free; - } - i += packet_size_length; - if ((1 + 4 + packet_size_length + packet_size) != count) { - printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" - " + packet_size([%zd]))([%zd]) != " - "count([%zd]). Invalid packet format.\n", - __func__, packet_size_length, packet_size, - (1 + packet_size_length + packet_size), count); - goto out_free; - } - rc = ecryptfs_miscdev_response(&data[i], packet_size, - euid, current_user_ns(), - task_pid(current), seq); - if (rc) printk(KERN_WARNING "%s: Failed to deliver miscdev " - "response to requesting operation; rc = [%d]\n", + "response to requesting operation; rc = [%zd]\n", __func__, rc); + goto out_free; + } break; case ECRYPTFS_MSG_HELO: case ECRYPTFS_MSG_QUIT: @@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, ecryptfs_printk(KERN_WARNING, "Dropping miscdev " "message of unrecognized type [%d]\n", data[0]); - break; + rc = -EINVAL; + goto out_free; } + rc = count; out_free: kfree(data); -out: - return sz; + return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 6a44148c5fb..a46b3a8fee1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) * @page: Page that is locked before this call is made * * Returns zero on success; non-zero otherwise + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem. In OpenPGP-compatible mode, we operate on + * entire underlying packets. */ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) { @@ -146,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page, KM_USER0); + page_virt = kmap_atomic(page); memset(page_virt, 0, PAGE_CACHE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -159,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt, KM_USER0); + kunmap_atomic(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " @@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @copied: The amount of data copied * @page: The eCryptfs page * @fsdata: The fsdata (unused) - * - * This is where we encrypt the data and pass the encrypted data to - * the lower filesystem. In OpenPGP-compatible mode, we operate on - * entire underlying packets. */ static int ecryptfs_write_end(struct file *file, struct address_space *mapping, diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 3745f7c2b9c..b2a34a192f4 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); - size_t total_remaining_bytes = ((offset + size) - pos); + loff_t total_remaining_bytes = ((offset + size) - pos); + + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } if (num_bytes > total_remaining_bytes) num_bytes = total_remaining_bytes; if (pos < offset) { /* remaining zeros to write, up to destination offset */ - size_t total_remaining_zeros = (offset - pos); + loff_t total_remaining_zeros = (offset - pos); if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; @@ -151,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + ecryptfs_page_virt = kmap_atomic(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -174,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt, KM_USER0); + kunmap_atomic(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); @@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, } pos += num_bytes; } - if ((offset + size) > ecryptfs_file_size) { - i_size_write(ecryptfs_inode, (offset + size)); + if (pos > ecryptfs_file_size) { + i_size_write(ecryptfs_inode, pos); if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { - rc = ecryptfs_write_inode_size_to_metadata( + int rc2; + + rc2 = ecryptfs_write_inode_size_to_metadata( ecryptfs_inode); - if (rc) { + if (rc2) { printk(KERN_ERR "Problem with " "ecryptfs_write_inode_size_to_metadata; " - "rc = [%d]\n", rc); + "rc = [%d]\n", rc2); + if (!rc) + rc = rc2; goto out; } } @@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, flush_dcache_page(page_for_ecryptfs); return rc; } - -#if 0 -/** - * ecryptfs_read - * @data: The virtual address into which to write the data read (and - * possibly decrypted) from the lower file - * @offset: The offset in the decrypted view of the file from which to - * read into @data - * @size: The number of bytes to read into @data - * @ecryptfs_file: The eCryptfs file from which to read - * - * Read an arbitrary amount of data from an arbitrary location in the - * eCryptfs page cache. This is done on an extent-by-extent basis; - * individual extents are decrypted and read from the lower page - * cache (via VFS reads). This function takes care of all the - * address translation to locations in the lower filesystem. - * - * Returns zero on success; non-zero otherwise - */ -int ecryptfs_read(char *data, loff_t offset, size_t size, - struct file *ecryptfs_file) -{ - struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; - struct page *ecryptfs_page; - char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); - loff_t data_offset = 0; - loff_t pos; - int rc = 0; - - if ((offset + size) > ecryptfs_file_size) { - rc = -EINVAL; - printk(KERN_ERR "%s: Attempt to read data past the end of the " - "file; offset = [%lld]; size = [%td]; " - "ecryptfs_file_size = [%lld]\n", - __func__, offset, size, ecryptfs_file_size); - goto out; - } - pos = offset; - while (pos < (offset + size)) { - pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); - size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); - size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); - size_t total_remaining_bytes = ((offset + size) - pos); - - if (num_bytes > total_remaining_bytes) - num_bytes = total_remaining_bytes; - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, - ecryptfs_page_idx); - if (IS_ERR(ecryptfs_page)) { - rc = PTR_ERR(ecryptfs_page); - printk(KERN_ERR "%s: Error getting page at " - "index [%ld] from eCryptfs inode " - "mapping; rc = [%d]\n", __func__, - ecryptfs_page_idx, rc); - goto out; - } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); - memcpy((data + data_offset), - ((char *)ecryptfs_page_virt + start_offset_in_page), - num_bytes); - kunmap_atomic(ecryptfs_page_virt, KM_USER0); - flush_dcache_page(ecryptfs_page); - SetPageUptodate(ecryptfs_page); - unlock_page(ecryptfs_page); - page_cache_release(ecryptfs_page); - pos += num_bytes; - data_offset += num_bytes; - } -out: - return rc; -} -#endif /* 0 */ diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index dbd52d40df4..cf152823bbf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -30,6 +30,8 @@ #include <linux/seq_file.h> #include <linux/file.h> #include <linux/crypto.h> +#include <linux/statfs.h> +#include <linux/magic.h> #include "ecryptfs_kernel.h" struct kmem_cache *ecryptfs_inode_info_cache; @@ -69,7 +71,6 @@ static void ecryptfs_i_callback(struct rcu_head *head) struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ecryptfs_inode_info_cache, inode_info); } @@ -103,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode) static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + int rc; if (!lower_dentry->d_sb->s_op->statfs) return -ENOSYS; - return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + + rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + if (rc) + return rc; + + buf->f_type = ECRYPTFS_SUPER_MAGIC; + rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen, + &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat); + + return rc; } /** @@ -132,9 +143,9 @@ static void ecryptfs_evict_inode(struct inode *inode) * Prints the mount options for a given superblock. * Returns zero; does not fail. */ -static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ecryptfs_show_options(struct seq_file *m, struct dentry *root) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = root->d_sb; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; struct ecryptfs_global_auth_tok *walker; diff --git a/fs/efs/super.c b/fs/efs/super.c index 0f31acb0131..981106429a9 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -68,7 +68,6 @@ static struct inode *efs_alloc_inode(struct super_block *sb) static void efs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23..4d9d3a45e35 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -299,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { @@ -446,6 +472,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from @@ -460,7 +498,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) pwq = list_first_entry(lsthead, struct eppoll_entry, llink); list_del(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); + ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } @@ -711,12 +749,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -827,6 +859,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ + list_del_init(&wait->task_list); + } + spin_lock_irqsave(&ep->lock, flags); /* @@ -926,6 +969,103 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + /* Allow an arbitrary number of depth 1 paths */ + if (nests == 0) + return 0; + + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1127,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1156,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1428,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1478,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1510,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1529,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1613,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1652,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1671,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/exec.c b/fs/exec.c index 36254645b7c..153dee14fe5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> + +#include <trace/events/task.h> #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. @@ -1067,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } +static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) +{ + int i, ch; + + /* Copies the binary name from after last slash */ + for (i = 0; (ch = *(fn++)) != '\0';) { + if (ch == '/') + i = 0; /* overwrite what we wrote */ + else + if (i < len - 1) + tcomm[i++] = ch; + } + tcomm[i] = '\0'; +} + int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1081,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm) set_mm_exe_file(bprm->mm, bprm->file); + filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1112,10 +1132,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - int i, ch; - const char *name; - char tcomm[sizeof(current->comm)]; - arch_pick_mmap_layout(current->mm); /* This is the point of no return */ @@ -1126,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - name = bprm->filename; - - /* Copies the binary name from after last slash */ - for (i=0; (ch = *(name++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < (sizeof(tcomm) - 1)) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; - set_task_comm(current, tcomm); + set_task_comm(current, bprm->tcomm); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on @@ -1225,7 +1230,7 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -int check_unsafe_exec(struct linux_binprm *bprm) +static int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; @@ -1910,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion *vfork_done; int core_waiters = -EBUSY; init_completion(&core_state->startup); @@ -1922,22 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (unlikely(core_waiters < 0)) - goto fail; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - vfork_done = tsk->vfork_done; - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } - - if (core_waiters) + if (core_waiters > 0) wait_for_completion(&core_state->startup); -fail: + return core_waiters; } diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index da42f32c49b..86194b2f799 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,14 +1,3 @@ -# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects -# for every ORE user we do it like this. Any user should add itself here -# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are -# selected here, and we default to "ON". So in effect it is like been -# selected by any of the users. -config ORE - tristate - depends on EXOFS_FS || PNFS_OBJLAYOUT - select ASYNC_XOR - default SCSI_OSD_ULD - config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore new file mode 100644 index 00000000000..1ca7fb7b6ba --- /dev/null +++ b/fs/exofs/Kconfig.ore @@ -0,0 +1,12 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + default SCSI_OSD_ULD diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d0941c6a1f7..80405836ba6 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -234,7 +234,7 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 51f4b4c40f0..ca9d49665ef 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -154,7 +154,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, int); +struct inode *exofs_new_inode(struct inode *, umode_t); extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_evict_inode(struct inode *); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f6dbf7768ce..ea5e1f97806 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1276,7 +1276,7 @@ static void create_done(struct ore_io_state *ios, void *p) /* * Set up a new inode and create an object for it on the OSD */ -struct inode *exofs_new_inode(struct inode *dir, int mode) +struct inode *exofs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index b54c43775f1..9dbf0c30103 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -59,7 +59,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode = exofs_new_inode(dir, mode); @@ -74,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, return err; } -static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, return exofs_add_nondir(dentry, inode); } -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int err = -EMLINK; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index d271ad83720..49cf230554a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, /* first/last seg is split */ num_raid_units += layout->group_width; - sgs_per_dev = div_u64(num_raid_units, data_devs); + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; } else { /* For Writes add parity pages array. */ max_par_pages = num_raid_units * pages_in_unit * @@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) u64 residual = ios->reading ? or->in.residual : or->out.residual; u64 offset = (ios->offset + ios->length) - residual; - struct ore_dev *od = ios->oc->ods[ - per_dev->dev - ios->oc->first_dev]; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; - on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, + on_dev_error(ios, od, dev, osi.osd_err_pri, offset, residual); } if (osi.osd_err_pri >= acumulated_osd_err) { diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 29c47e5c4a8..d222c77cfa1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios) /* @si contains info of the to-be-inserted page. Update of @si should be * maintained by caller. Specificaly si->dev, si->obj_offset, ... */ -static int _add_to_read_4_write(struct ore_io_state *ios, - struct ore_striping_info *si, struct page *page) +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) { struct request_queue *q; struct ore_per_dev_state *per_dev; @@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios, _ore_add_sg_seg(per_dev, gap, true); } q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); - added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); - if (unlikely(added_len != PAGE_SIZE)) { + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", per_dev->bio->bi_vcnt); return -ENOMEM; } - per_dev->length += PAGE_SIZE; + per_dev->length += pg_len; return 0; } +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.unit_off / PAGE_SIZE; + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, + ios->layout->mirrors_p1, si.par_dev, si.dev); + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) { struct bio_vec *bv; @@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios) struct page **pp = &_1ps->pages[c]; bool uptodate; - if (*pp) + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); /* to-be-written pages start here */ goto read_last_stripe; + } *pp = ios->r4w->get_page(ios->private, offset, &uptodate); @@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios) return -ENOMEM; if (!uptodate) - _add_to_read_4_write(ios, &read_si, *pp); + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; @@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios) } read_last_stripe: - offset = ios->offset + (ios->length + PAGE_SIZE - 1) / - PAGE_SIZE * PAGE_SIZE; + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) * bytes_in_stripe; if (offset == last_stripe_end) /* Optimize for the aligned case */ @@ -503,7 +553,7 @@ read_last_stripe: /* Mark read-pages to be cache_released */ _1ps->page_is_read[c] = true; if (!uptodate) - _add_to_read_4_write(ios, &read_si, page); + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); } offset += PAGE_SIZE; @@ -551,7 +601,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios, unsigned cur_len) { if (ios->reading) { - BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } _ore_add_sg_seg(per_dev, cur_len, true); } else { struct __stripe_pages_2d *sp2d = ios->sp2d; @@ -612,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) return -ENOMEM; } - BUG_ON(ios->offset % PAGE_SIZE); - /* Round io down to last full strip */ first_stripe = div_u64(ios->offset, stripe_size); last_stripe = div_u64(ios->offset + ios->length, stripe_size); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index e6085ec192d..d22cd168c6e 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -166,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) static void exofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); } @@ -839,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; goto free_sbi; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 47cda410b54..d37df352d32 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -279,7 +279,7 @@ static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; else diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 9a4e5e206d0..75ad433c669 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); /* ialloc.c */ -extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *); +extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); extern void ext2_check_inodes_bitmap (struct super_block *); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c4e81dfb74b..8b15cf8cef3 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -429,7 +429,7 @@ found: return group; } -struct inode *ext2_new_inode(struct inode *dir, int mode, +struct inode *ext2_new_inode(struct inode *dir, umode_t mode, const struct qstr *qstr) { struct super_block *sb; @@ -573,8 +573,11 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + ext2_error(sb, "ext2_new_inode", + "inode number already in use - inode=%lu", + (unsigned long) ino); + err = -EIO; + goto fail; } dquot_initialize(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 91a6945af6d..740cad8dcd8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,7 +26,6 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/module.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/mpage.h> @@ -36,10 +35,6 @@ #include "acl.h" #include "xip.h" -MODULE_AUTHOR("Remy Card and others"); -MODULE_DESCRIPTION("Second Extended Filesystem"); -MODULE_LICENSE("GPL"); - static int __ext2_write_inode(struct inode *inode, int do_sync); /* diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index f81e250ac5c..2de655f5d62 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -35,7 +35,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETFLAGS: { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -77,31 +77,41 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = flags & EXT2_FL_USER_MODIFIABLE; flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; ei->i_flags = flags; - mutex_unlock(&inode->i_mutex); ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; + mutex_unlock(&inode->i_mutex); + mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); - case EXT2_IOC_SETVERSION: + case EXT2_IOC_SETVERSION: { + __u32 generation; + if (!inode_owner_or_capable(inode)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; - if (get_user(inode->i_generation, (int __user *) arg)) { + if (get_user(generation, (int __user *) arg)) { ret = -EFAULT; - } else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); + goto setversion_out; } - mnt_drop_write(filp->f_path.mnt); + + mutex_lock(&inode->i_mutex); + inode->i_ctime = CURRENT_TIME_SEC; + inode->i_generation = generation; + mutex_unlock(&inode->i_mutex); + + mark_inode_dirty(inode); +setversion_out: + mnt_drop_write_file(filp); return ret; + } case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -121,7 +131,7 @@ setflags_out: if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -145,7 +155,7 @@ setflags_out: rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return 0; } default: diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 761fde807fc..080419814ba 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -94,7 +94,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -119,7 +119,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st return ext2_add_nondir(dentry, inode); } -static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -214,7 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index bd8ac164a3b..0090595beb2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -173,7 +173,6 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) static void ext2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } @@ -211,9 +210,9 @@ static void destroy_inodecache(void) kmem_cache_destroy(ext2_inode_cachep); } -static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext2_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -1521,5 +1520,8 @@ static void __exit exit_ext2_fs(void) exit_ext2_xattr(); } +MODULE_AUTHOR("Remy Card and others"); +MODULE_DESCRIPTION("Second Extended Filesystem"); +MODULE_LICENSE("GPL"); module_init(init_ext2_fs) module_exit(exit_ext2_fs) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index d27b71f1d18..6dcafc7efdf 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -54,7 +54,6 @@ */ #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/mbcache.h> diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index c922adc8ef4..be7a8d02c9a 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/fs.h> diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 667e46a8d62..2989467d359 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 099d20f4716..f470e44c4b8 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -6,7 +6,6 @@ */ #include <linux/init.h> -#include <linux/module.h> #include <linux/string.h> #include "ext2.h" #include "xattr.h" diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5c866e06e7a..1cde2843801 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -371,7 +371,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * group to find a free inode. */ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, - const struct qstr *qstr, int mode) + const struct qstr *qstr, umode_t mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; @@ -525,8 +525,12 @@ got: if (IS_DIRSYNC(inode)) handle->h_sync = 1; if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 85fe655fe3e..2d0afeca0b4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -22,7 +22,6 @@ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/ext3_jbd.h> @@ -223,8 +222,12 @@ void ext3_evict_inode (struct inode *inode) * * Note that directories do not have this problem because they don't * use page cache. + * + * The s_journal check handles the case when ext3_get_journal() fails + * and puts the journal inode. */ if (inode->i_nlink && ext3_should_journal_data(inode) && + EXT3_SB(inode->i_sb)->s_journal && (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { tid_t commit_tid = atomic_read(&ei->i_datasync_tid); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; @@ -1132,9 +1135,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, bh = ext3_getblk(handle, inode, block, create, err); if (!bh) return bh; - if (buffer_uptodate(bh)) + if (bh_uptodate_or_lock(bh)) return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1617,7 +1622,13 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); /* * We give up here if we're reentered, because it might be for a @@ -1692,7 +1703,13 @@ static int ext3_writeback_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto out_fail; @@ -1735,7 +1752,13 @@ static int ext3_journalled_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); - WARN_ON_ONCE(IS_RDONLY(inode)); + /* + * We don't want to warn for emergency remount. The condition is + * ordered to avoid dereferencing inode->i_sb in non-error case to + * avoid slow-downs. + */ + WARN_ON_ONCE(IS_RDONLY(inode) && + !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); if (ext3_journal_current_handle()) goto no_write; @@ -2064,12 +2087,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from) if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + if (!bh_uptodate_or_lock(bh)) { + err = bh_submit_read(bh); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err) goto unlock; } @@ -2490,7 +2511,7 @@ int ext3_can_truncate(struct inode *inode) * transaction, and VFS/VM ensures that ext3_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index ba1b54e23ca..4af574ce4a4 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -44,7 +44,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -110,7 +110,7 @@ flags_err: err = ext3_change_inode_journal_flag(inode, jflag); flags_out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GETVERSION: @@ -126,7 +126,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { @@ -134,10 +134,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -146,8 +147,11 @@ flags_out: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GETRSVSZ: @@ -164,7 +168,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -195,7 +199,7 @@ setversion_out: } mutex_unlock(&ei->truncate_mutex); setrsvsz_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_EXTEND: { @@ -206,7 +210,7 @@ setrsvsz_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -221,7 +225,7 @@ setrsvsz_out: if (err == 0) err = err2; group_extend_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT3_IOC_GROUP_ADD: { @@ -232,7 +236,7 @@ group_extend_out: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -249,7 +253,7 @@ group_extend_out: if (err == 0) err = err2; group_add_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case FITRIM: { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 642dc6d66df..e8e211795e9 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -921,9 +921,12 @@ restart: num++; bh = ext3_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, - 1, &bh); + if (bh && !bh_uptodate_or_lock(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ | REQ_META | REQ_PRIO, + bh); + } } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -1698,7 +1701,7 @@ static int ext3_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, +static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { handle_t *handle; @@ -1732,7 +1735,7 @@ retry: } static int ext3_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1768,7 +1771,7 @@ retry: return err; } -static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; struct inode * inode; @@ -2272,7 +2275,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext3_orphan_del(handle, inode); if (err) { ext3_journal_stop(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 922d289aeeb..726c7ef6cdf 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -511,7 +511,6 @@ static int ext3_drop_inode(struct inode *inode) static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } @@ -611,9 +610,9 @@ static char *data_mode_string(unsigned long mode) * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext3_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long def_mount_opts; @@ -2060,9 +2059,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { + ext3_mark_recovery_complete(sb, es); ext3_msg(sb, KERN_INFO, "recovery complete"); - ext3_mark_recovery_complete(sb, es); + } ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": @@ -2230,11 +2230,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - ext3_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; + if (!bh_uptodate_or_lock(journal->j_sb_buffer)) { + if (bh_submit_read(journal->j_sb_buffer)) { + ext3_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext3_msg(sb, KERN_ERR, @@ -2910,7 +2910,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) return -EXDEV; /* Journaling quota? */ if (EXT3_SB(sb)->s_qf_names[type]) { diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3c218b8a51d..ea26f2acab9 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/fs.h> diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index dc8edda9ffe..2526a8829de 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 7a321974d58..b32e473a1e3 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/ext3_jbd.h> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 12ccacda44e..f9e2cd8cf71 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -23,6 +23,8 @@ #include <trace/events/ext4.h> +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ -unsigned ext4_num_base_meta_clusters(struct super_block *sb, +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8efb2f0a344..3f11656bd72 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -13,7 +13,6 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> -#include <linux/module.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b0e26a1272..513004fc3d8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,6 +511,14 @@ struct ext4_new_group_data { __u32 free_blocks_count; }; +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + /* * Flags used by ext4_map_blocks() */ @@ -575,6 +583,7 @@ struct ext4_new_group_data { /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -957,12 +966,13 @@ struct ext4_inode_info { #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) -#define ext4_set_bit __test_and_set_bit_le +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le #define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_clear_bit __test_and_clear_bit_le +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le #define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le -#define ext4_find_first_zero_bit find_first_zero_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb, extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); extern unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); @@ -1819,7 +1830,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, +extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner); extern void ext4_free_inode(handle_t *, struct inode *); @@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_discard_partial_page_buffers(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length, int flags); -extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb, extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern void *ext4_kvmalloc(size_t size, gfp_t flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 607b1557d29..74f23c292e1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -29,7 +29,6 @@ * - smart tree reduction */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> @@ -3281,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t i, pg_lblk; pgoff_t index; + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + /* reverse search wont work if fs block size is less than page size */ if (inode->i_blkbits < PAGE_CACHE_SHIFT) search_hint_reverse = 0; @@ -3453,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" - "block %llu, max_blocks %u, flags %d, allocated %u", + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); @@ -3625,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start, rr_cluster_end; + ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); @@ -3636,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb, /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00beb4f9cc4..25d8c9781ad 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -351,14 +351,14 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode, + ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; + unsigned int freei, avefreei, grp_free; ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; @@ -477,8 +477,8 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { + grp_free = ext4_free_inodes_count(sb, desc); + if (desc && grp_free && grp_free >= avefreei) { *group = grp; return 0; } @@ -497,7 +497,7 @@ fallback_retry: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); @@ -602,7 +602,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ static int ext4_claim_inode(struct super_block *sb, struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, int mode) + unsigned long ino, ext4_group_t group, umode_t mode) { int free = 0, retval = 0, count; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb, */ down_read(&grp->alloc_sem); ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; goto err_ret; @@ -690,7 +690,7 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner) { struct super_block *sb; @@ -885,8 +885,12 @@ got: if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + goto fail; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3cfc73fbca8..830e1b2bf14 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,7 +20,6 @@ * (sct@redhat.com), 1993, 1998 */ -#include <linux/module.h> #include "ext4_jbd2.h" #include "truncate.h" diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 92655fd8965..feaa82fe629 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -18,7 +18,6 @@ * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> @@ -72,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); /* * Test whether an inode is a fast symlink. @@ -1881,7 +1883,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); * a[0] = 'a'; * truncate(f, 4096); * we have in the page first buffer_head mapped via page_mkwrite call back - * but other bufer_heads would be unmapped but dirty(dirty done via the + * but other buffer_heads would be unmapped but dirty (dirty done via the * do_wp_page). So writepage should write the first block. If we modify * the mmap area beyond 1024 we will again get a page_fault and the * page_mkwrite callback will do the block allocation and mark the @@ -2760,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!io_end || !size) goto out; - ext_debug("ext4_end_io_dio(): io_end 0x%p" + ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); @@ -3161,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle, * * Returns zero on sucess or negative on failure. */ -int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, struct inode *inode, struct page *page, loff_t from, loff_t length, int flags) { @@ -3301,126 +3303,6 @@ next: return err; } -/* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; - ext4_lblk_t iblock; - struct inode *inode = mapping->host; - struct buffer_head *bh; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - err = 0; - if (buffer_freed(bh)) { - BUFFER_TRACE(bh, "freed: skip"); - goto unlock; - } - - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto unlock; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto unlock; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto unlock; - } - - zero_user(page, offset, length); - - BUFFER_TRACE(bh, "zeroed end of block"); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - -unlock: - unlock_page(page); - page_cache_release(page); - return err; -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -3469,7 +3351,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -4647,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -4661,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else + else { + jbd2_journal_flush(journal); ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a56796814d6..6eee25591b8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -18,6 +18,8 @@ #include "ext4_jbd2.h" #include "ext4.h" +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; @@ -45,7 +47,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -134,7 +136,7 @@ flags_err: err = ext4_ext_migrate(inode); flags_out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT4_IOC_GETVERSION: @@ -150,7 +152,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { @@ -158,10 +160,11 @@ flags_out: goto setversion_out; } + mutex_lock(&inode->i_mutex); handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto setversion_out; + goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { @@ -170,8 +173,11 @@ flags_out: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } case EXT4_IOC_GROUP_EXTEND: { @@ -182,19 +188,22 @@ setversion_out: if (err) return err; - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_extend_out; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) - return err; + goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { @@ -204,9 +213,9 @@ setversion_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); +group_extend_out: ext4_resize_end(sb); - return err; } @@ -240,15 +249,14 @@ setversion_out: return -EOPNOTSUPP; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) goto mext_out; err = ext4_move_extents(filp, donor_filp, me.orig_start, me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); mnt_drop_write(filp->f_path.mnt); - if (me.moved_len > 0) - file_remove_suid(donor_filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) @@ -267,19 +275,22 @@ mext_out: return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto group_add_out; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) - return err; + goto group_add_out; err = ext4_group_add(sb, &input); if (EXT4_SB(sb)->s_journal) { @@ -289,9 +300,9 @@ mext_out: } if (err == 0) err = err2; - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); +group_add_out: ext4_resize_end(sb); - return err; } @@ -301,7 +312,7 @@ mext_out: if (!inode_owner_or_capable(inode)) return -EACCES; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; /* @@ -313,7 +324,7 @@ mext_out: mutex_lock(&(inode->i_mutex)); err = ext4_ext_migrate(inode); mutex_unlock(&(inode->i_mutex)); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } @@ -323,11 +334,65 @@ mext_out: if (!inode_owner_or_capable(inode)) return -EACCES; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); return err; } @@ -429,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case FITRIM: + case EXT4_IOC_RESIZE_FS: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2d8be8f28b..cb990b21c69 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 16ac228dbec..e7d6bb0acfa 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -12,7 +12,6 @@ * */ -#include <linux/module.h> #include <linux/slab.h> #include "ext4_jbd2.h" diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index aa4c782c9dd..2043f482375 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1736,7 +1736,7 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, +static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { handle_t *handle; @@ -1770,7 +1770,7 @@ retry: } static int ext4_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1806,7 +1806,7 @@ retry: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2315,7 +2315,7 @@ retry: err = PTR_ERR(handle); goto err_drop_inode; } - inc_nlink(inode); + set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); if (err) { ext4_journal_stop(handle); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7e106c810c6..47585189651 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -6,7 +6,6 @@ * Written by Theodore Ts'o, 2010. */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 996780ab4f4..f9d948f0eb8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. + */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + */ +static void ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_bg_has_super(sb, src_group) ? + (1 + ext4_bg_num_gdb(sb, src_group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + + start_blk += overhead; + + BUG_ON(src_group >= group_data[0].group + flex_gd->count); + /* We collect contiguous blocks as much as possible. */ + src_group++; + for (; src_group <= last_group; src_group++) + if (!ext4_bg_has_super(sb, src_group)) + last_blk += group_data[src_group - group].blocks_count; + else + break; + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group -= group_data[0].group; + group_data[group].free_blocks_count -= + EXT4_SB(sb)->s_itb_per_group; + if (flexbg_size > 1) + flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { @@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh) } /* - * Set up the block and inode bitmaps, and the inode table for the new group. + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . + * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) { + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - if (IS_ERR(handle)) return PTR_ERR(handle); - BUG_ON(input->group != sbi->s_groups_count); + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto exit_journal; + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, gdb))) { + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } brelse(gdb); - goto exit_journal; } - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto exit_journal; + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; } - brelse(gdb); - } - /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, - GFP_NOFS); - if (err) - goto exit_journal; + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; - err = extend_or_restart_transaction(handle, 2); - if (err) - goto exit_journal; + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; - bh = bclean(handle, sb, input->block_bitmap); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto exit_journal; - } +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup group tables %#04llx (+0)\n", start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); - } + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); - if (err) - goto exit_bh; - ext4_set_bits(bh->b_data, input->inode_table - start, - sbi->s_itb_per_group); + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; - ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_bh; + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); } - brelse(bh); - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } } - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - ext4_std_error(sb, err); -exit_bh: +out: brelse(bh); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; return err; @@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (!gdb_bh) return -EIO; - gdbackups = verify_reserved_gdb(sb, gdb_bh); + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto exit_bh; @@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; @@ -735,6 +1021,348 @@ exit_err: } } +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it @@ -750,16 +1378,15 @@ exit_err: */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { + struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; struct inode *inode = NULL; - handle_t *handle; int gdb_off, gdb_num; - int err, err2; + int err; + __u16 bg_flags = 0; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); @@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } - if ((err = verify_group_input(sb, input))) - goto exit_put; + err = verify_group_input(sb, input); + if (err) + goto out; - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. +/* + * extend a group without checking assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); + handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto exit_put; + ext4_warning(sb, "error %d on journal start", err); + return err; } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; - - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { - err = reserve_backup_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - } - } else { - /* - * Note that we can access new group descriptor block safely - * only if add_new_gdb() succeeds. - */ - err = add_new_gdb(handle, inode, input->group); - if (err) - goto exit_journal; - primary = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; } - /* - * OK, now we've set up the new group. Time to make it active. - * - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ - - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)primary->b_data + - gdb_off * EXT4_DESC_SIZE(sb)); - - memset(gdp, 0, EXT4_DESC_SIZE(sb)); - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count); - ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); - - /* - * We can allocate memory for mb_alloc based on the new group - * descriptor - */ - err = ext4_mb_add_groupinfo(sb, input->group, gdp); + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) - goto exit_journal; - - /* - * Make the new blocks and inodes valid next. We do this before - * increasing the group count so that once the group is enabled, - * all of its blocks and inodes are already valid. - * - * We always allocate group-by-group, then block-by-block or - * inode-by-inode within a group, so enabling these - * blocks/inodes before the group is live won't actually let us - * allocate the new space yet. - */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count++; - - err = ext4_handle_dirty_metadata(handle, NULL, primary); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_journal; - } - - /* Update the reserved block counts only once the new group is - * active. */ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, input->free_blocks_count)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, input->group); - atomic_add(EXT4_B2C(sbi, input->free_blocks_count), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb), - &sbi->s_flex_groups[flex_group].free_inodes); - } - + goto errout; ext4_handle_dirty_super(handle, sb); - -exit_journal: - if ((err2 = ext4_journal_stop(handle)) && !err) + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; - if (!err && primary) { - update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); } -exit_put: - iput(inode); return err; -} /* ext4_group_add */ +} /* * Extend the filesystem to the new number of blocks specified. This entry @@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; - handle_t *handle; - int err, err2; + int err; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); @@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - goto exit_put; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; } - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, "error %d on journal write access", err); - ext4_journal_stop(handle); - goto exit_put; + if (n_blocks_count == o_blocks_count) + /* Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT blocks, can't resize"); + return -EPERM; } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - err2 = ext4_journal_stop(handle); - if (!err && err2) - err = err2; - if (err) - goto exit_put; + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + if (offset != 0) { + /* extend the last group */ + ext4_grpblk_t add; + add = EXT4_BLOCKS_PER_GROUP(sb) - offset; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " + "upto %llu blocks\n", o_blocks_count, n_blocks_count); return err; -} /* ext4_group_extend */ +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e1329e2f82..502c61fd739 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -930,7 +930,6 @@ static int ext4_drop_inode(struct inode *inode) static void ext4_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } @@ -1033,11 +1032,11 @@ static inline void ext4_show_quota_options(struct seq_file *seq, * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ -static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ext4_show_options(struct seq_file *seq, struct dentry *root) { int def_errors; unsigned long def_mount_opts; - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = root->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -1096,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) } if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { seq_printf(seq, ",max_batch_time=%u", - (unsigned) sbi->s_min_batch_time); + (unsigned) sbi->s_max_batch_time); } /* @@ -2006,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group_count; ext4_group_t flex_group; - int groups_per_flex = 0; + unsigned int groups_per_flex = 0; size_t size; int i; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - if (groups_per_flex < 2) { + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; /* We allocate both existing and potentially added groups */ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + @@ -2883,8 +2881,7 @@ cont_thread: } mutex_unlock(&eli->li_list_mtx); - if (freezing(current)) - refrigerator(); + try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || @@ -3508,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); @@ -3735,10 +3732,12 @@ no_journal: } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { + iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; @@ -3775,7 +3774,7 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4; + goto failed_mount4a; } ext4_ext_init(sb); @@ -3832,13 +3831,14 @@ cantfind_ext4: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_ext_release(sb); -failed_mount5: ext4_mb_release(sb); +failed_mount5: + ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4: - iput(root); +failed_mount4a: + dput(sb->s_root); sb->s_root = NULL; +failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: @@ -4782,7 +4782,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) return -EXDEV; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 34e4350dd4d..d2a200624af 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -3,7 +3,6 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/security.h> @@ -48,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 37e6ebca2cc..95f1f4ab59a 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -5,7 +5,6 @@ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 98c375352d0..0edb7611ffb 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -5,7 +5,6 @@ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include "ext4_jbd2.h" diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 1510a4d5199..66994f316e1 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -141,7 +141,7 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) static inline int fat_mode_can_hold_ro(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - mode_t mask; + umode_t mask; if (S_ISDIR(inode->i_mode)) { if (!sbi->options.rodir) @@ -156,8 +156,8 @@ static inline int fat_mode_can_hold_ro(struct inode *inode) } /* Convert attribute bits and a mask to the UNIX mode. */ -static inline mode_t fat_make_mode(struct msdos_sb_info *sbi, - u8 attrs, mode_t mode) +static inline umode_t fat_make_mode(struct msdos_sb_info *sbi, + u8 attrs, umode_t mode) { if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) mode &= ~S_IWUGO; diff --git a/fs/fat/file.c b/fs/fat/file.c index c118acf16e4..a71fe3715ee 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -44,7 +44,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) goto out; mutex_lock(&inode->i_mutex); - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out_unlock_inode; @@ -108,7 +108,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out_unlock_inode: mutex_unlock(&inode->i_mutex); out: @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(fat_getattr); static int fat_sanitize_mode(const struct msdos_sb_info *sbi, struct inode *inode, umode_t *mode_ptr) { - mode_t mask, perm; + umode_t mask, perm; /* * Note, the basic check is already done by a caller of @@ -351,7 +351,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) { - mode_t allow_utime = sbi->options.allow_utime; + umode_t allow_utime = sbi->options.allow_utime; if (current_fsuid() != inode->i_uid) { if (in_group_p(inode->i_gid)) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 808cac7edcf..3ab841054d5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -518,7 +518,6 @@ static struct inode *fat_alloc_inode(struct super_block *sb) static void fat_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } @@ -672,7 +671,7 @@ int fat_sync_inode(struct inode *inode) EXPORT_SYMBOL_GPL(fat_sync_inode); -static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); +static int fat_show_options(struct seq_file *m, struct dentry *root); static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, .destroy_inode = fat_destroy_inode, @@ -811,9 +810,9 @@ static const struct export_operations fat_export_ops = { .get_parent = fat_get_parent, }; -static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) +static int fat_show_options(struct seq_file *m, struct dentry *root) { - struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb); + struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); struct fat_mount_options *opts = &sbi->options; int isvfat = opts->isvfat; @@ -898,7 +897,7 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, }; @@ -928,17 +927,17 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, - {Opt_obsolate, "conv=binary"}, - {Opt_obsolate, "conv=text"}, - {Opt_obsolate, "conv=auto"}, - {Opt_obsolate, "conv=b"}, - {Opt_obsolate, "conv=t"}, - {Opt_obsolate, "conv=a"}, - {Opt_obsolate, "fat=%u"}, - {Opt_obsolate, "blocksize=%u"}, - {Opt_obsolate, "cvf_format=%20s"}, - {Opt_obsolate, "cvf_options=%100s"}, - {Opt_obsolate, "posix"}, + {Opt_obsolete, "conv=binary"}, + {Opt_obsolete, "conv=text"}, + {Opt_obsolete, "conv=auto"}, + {Opt_obsolete, "conv=b"}, + {Opt_obsolete, "conv=t"}, + {Opt_obsolete, "conv=a"}, + {Opt_obsolete, "fat=%u"}, + {Opt_obsolete, "blocksize=%u"}, + {Opt_obsolete, "cvf_format=%20s"}, + {Opt_obsolete, "cvf_options=%100s"}, + {Opt_obsolete, "posix"}, {Opt_err, NULL}, }; static const match_table_t msdos_tokens = { @@ -1170,7 +1169,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, break; /* obsolete mount options */ - case Opt_obsolate: + case Opt_obsolete: fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " "not supported now", p); break; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 216b419f30e..c5938c9084b 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -264,7 +264,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, +static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; @@ -346,7 +346,7 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a87a65663c2..a81eb2367d3 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, int charlen; if (utf8) { - *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); + *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN, + (wchar_t *) outname, FAT_LFN_LEN + 2); if (*outlen < 0) return *outlen; else if (*outlen > FAT_LFN_LEN) @@ -781,7 +782,7 @@ error: return ERR_PTR(err); } -static int vfat_create(struct inode *dir, struct dentry *dentry, int mode, +static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; @@ -870,7 +871,7 @@ out: return err; } -static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/fhandle.c b/fs/fhandle.c index 6b088641f5b..a48e4a139be 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -10,6 +10,7 @@ #include <linux/personality.h> #include <asm/uaccess.h> #include "internal.h" +#include "mount.h" static long do_sys_name_to_handle(struct path *path, struct file_handle __user *ufh, @@ -24,8 +25,8 @@ static long do_sys_name_to_handle(struct path *path, * We need t make sure wether the file system * support decoding of the file handle */ - if (!path->mnt->mnt_sb->s_export_op || - !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + if (!path->dentry->d_sb->s_export_op || + !path->dentry->d_sb->s_export_op->fh_to_dentry) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) @@ -66,7 +67,8 @@ static long do_sys_name_to_handle(struct path *path, } else retval = 0; /* copy the mount id */ - if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + if (copy_to_user(mnt_id, &real_mount(path->mnt)->mnt_id, + sizeof(*mnt_id)) || copy_to_user(ufh, handle, sizeof(struct file_handle) + handle_bytes)) retval = -EFAULT; diff --git a/fs/file_table.c b/fs/file_table.c index c322794f736..20002e39754 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -474,29 +474,6 @@ void file_sb_list_del(struct file *file) #endif -int fs_may_remount_ro(struct super_block *sb) -{ - struct file *file; - /* Check that no files are currently opened for writing. */ - lg_global_lock(files_lglock); - do_file_list_for_each_entry(sb, file) { - struct inode *inode = file->f_path.dentry->d_inode; - - /* File with pending delete? */ - if (inode->i_nlink == 0) - goto too_bad; - - /* Writeable file? */ - if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) - goto too_bad; - } while_file_list_for_each_entry; - lg_global_unlock(files_lglock); - return 1; /* Tis' cool bro. */ -too_bad: - lg_global_unlock(files_lglock); - return 0; -} - /** * mark_files_ro - mark all files read-only * @sb: superblock in question diff --git a/fs/filesystems.c b/fs/filesystems.c index 0845f84f2a5..96f24286667 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -74,7 +74,6 @@ int register_filesystem(struct file_system_type * fs) BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; - INIT_LIST_HEAD(&fs->fs_supers); write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 7b2af5abe2f..cf9ef918a2a 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -187,10 +187,10 @@ vxfs_stiget(struct super_block *sbp, ino_t ino) * vxfs_transmod returns a Linux mode_t for a given * VxFS inode structure. */ -static __inline__ mode_t +static __inline__ umode_t vxfs_transmod(struct vxfs_inode_info *vip) { - mode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; if (VXFS_ISFIFO(vip)) ret |= S_IFIFO; @@ -340,7 +340,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino) static void vxfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(vxfs_inode_cachep, inode->i_private); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 517f211a3bd..5b4a9362d5a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,16 +20,21 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> -#include <linux/buffer_head.h> #include <linux/tracepoint.h> #include "internal.h" /* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + +/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { @@ -48,14 +53,6 @@ struct wb_writeback_work { }; /* - * Include the creation of the trace points after defining the - * wb_writeback_work structure so that the definition remains local to this - * file. - */ -#define CREATE_TRACE_POINTS -#include <trace/events/writeback.h> - -/* * We don't actually have pdflush, but this one is exported though /proc... */ int nr_pdflush_threads; @@ -87,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure and inline functions so that the definition + * remains local to this file. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/writeback.h> + /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { @@ -743,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh(wb->bdi)) break; + /* + * Kupdate and background works are special and we want to + * include all inodes that need writing. Livelock avoidance is + * handled by these works yielding to any other work so we are + * safe. + */ if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - work->older_than_this = &oldest_jif; - } + } else if (work->for_background) + oldest_jif = jiffies; trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) @@ -937,7 +948,7 @@ int bdi_writeback_thread(void *data) trace_writeback_thread_start(bdi); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { /* * Remove own delayed wake-up timer, since we are already awake * and we'll take care of the preriodic write-back. @@ -967,8 +978,6 @@ int bdi_writeback_thread(void *data) */ schedule(); } - - try_to_freeze(); } /* Flush any work that raced with us exiting */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2aaf3eaaf13..5f3368ab0fa 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + up_read(&fc->killsb); + kfree(buf); + return err; + +err: + kfree(buf); + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_delete_out outarg; + int err = -ENOMEM; + char *buf; + struct qstr name; + + buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); + if (!buf) + goto err; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, + outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RETRIEVE: return fuse_notify_retrieve(fc, size, cs); + case FUSE_NOTIFY_DELETE: + return fuse_notify_delete(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9f63e493a9b..206632887bb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -369,8 +369,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, - struct nameidata *nd) +static int fuse_create_open(struct inode *dir, struct dentry *entry, + umode_t mode, struct nameidata *nd) { int err; struct inode *inode; @@ -480,7 +480,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, */ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct inode *dir, struct dentry *entry, - int mode) + umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -547,7 +547,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, return err; } -static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, +static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; @@ -573,7 +573,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, return create_new_entry(fc, req, dir, entry, mode); } -static int fuse_create(struct inode *dir, struct dentry *entry, int mode, +static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, struct nameidata *nd) { if (nd) { @@ -585,7 +585,7 @@ static int fuse_create(struct inode *dir, struct dentry *entry, int mode, return fuse_mknod(dir, entry, mode, 0); } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) +static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); @@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name) + u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; struct inode *parent; @@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); + + if (child_nodeid != 0 && entry->d_inode) { + mutex_lock(&entry->d_inode->i_mutex); + if (get_node_id(entry->d_inode) != child_nodeid) { + err = -ENOENT; + goto badentry; + } + if (d_mountpoint(entry)) { + err = -EBUSY; + goto badentry; + } + if (S_ISDIR(entry->d_inode->i_mode)) { + shrink_dcache_parent(entry); + if (!simple_empty(entry)) { + err = -ENOTEMPTY; + goto badentry; + } + entry->d_inode->i_flags |= S_DEAD; + } + dont_mount(entry); + clear_nlink(entry->d_inode); + err = 0; + badentry: + mutex_unlock(&entry->d_inode->i_mutex); + if (!err) + d_delete(entry); + } else { + err = 0; + } dput(entry); - err = 0; unlock: mutex_unlock(&parent->i_mutex); @@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, return fuse_fsync_common(file, start, end, datasync, 1); } +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + + if (fc->minor < 18) + return -ENOTTY; + + return fuse_ioctl_common(file, cmd, arg, + FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); +} + static bool update_mtime(unsigned ivalid) { /* Always update if mtime is explicitly set */ @@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = { .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, + .unlocked_ioctl = fuse_dir_ioctl, + .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0c84100acd4..4a199fd93fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) loff_t retval; struct inode *inode = file->f_path.dentry->d_inode; - mutex_lock(&inode->i_mutex); - if (origin != SEEK_CUR && origin != SEEK_SET) { - retval = fuse_update_attributes(inode, NULL, file, NULL); - if (retval) - goto exit; - } + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ + if (origin == SEEK_CUR || origin == SEEK_SET) + return generic_file_llseek(file, offset, origin); - switch (origin) { - case SEEK_END: - offset += i_size_read(inode); - break; - case SEEK_CUR: - if (offset == 0) { - retval = file->f_pos; - goto exit; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - break; - case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - retval = -ENXIO; - goto exit; - } - offset = i_size_read(inode); - break; - } - retval = -EINVAL; - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } -exit: + mutex_lock(&inode->i_mutex); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, origin); mutex_unlock(&inode->i_mutex); + return retval; } @@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } EXPORT_SYMBOL_GPL(fuse_do_ioctl); -static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) { struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); @@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, static long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, 0); + return fuse_ioctl_common(file, cmd, arg, 0); } static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); + return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); } /* diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cf6db0a9321..572cefc7801 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -80,7 +80,7 @@ struct fuse_inode { /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ - mode_t orig_i_mode; + umode_t orig_i_mode; /** Version of last attribute change */ u64 attr_version; @@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, /** * File-system tells the kernel to invalidate parent attributes and * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + * - matches the inode number for the dentry matching parent/name, + * - is not a mount point + * - is a file or oan empty directory + * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, - struct qstr *name); + u64 child_nodeid, struct qstr *name); int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); @@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags); unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index aa83109b943..64cf8d07393 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -107,7 +107,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) static void fuse_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fuse_inode_cachep, inode); } @@ -498,9 +497,10 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) return 1; } -static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) +static int fuse_show_options(struct seq_file *m, struct dentry *root) { - struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb); + struct super_block *sb = root->d_sb; + struct fuse_conn *fc = get_fuse_conn_super(sb); seq_printf(m, ",user_id=%u", fc->user_id); seq_printf(m, ",group_id=%u", fc->group_id); @@ -510,9 +510,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); - if (mnt->mnt_sb->s_bdev && - mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 65978d7885c..230eb0f005b 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -38,8 +38,9 @@ static const char *gfs2_acl_name(int type) return NULL; } -static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type) { + struct gfs2_inode *ip = GFS2_I(inode); struct posix_acl *acl; const char *name; char *data; @@ -67,11 +68,6 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type) return acl; } -struct posix_acl *gfs2_get_acl(struct inode *inode, int type) -{ - return gfs2_acl_get(GFS2_I(inode), type); -} - static int gfs2_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -125,7 +121,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) if (S_ISLNK(inode->i_mode)) return 0; - acl = gfs2_acl_get(dip, ACL_TYPE_DEFAULT); + acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) { @@ -166,7 +162,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) unsigned int len; int error; - acl = gfs2_acl_get(ip, ACL_TYPE_ACCESS); + acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) @@ -216,7 +212,7 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, if (type < 0) return type; - acl = gfs2_acl_get(GFS2_I(inode), type); + acl = gfs2_get_acl(inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4858e1fed8b..501e5cba09b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; - struct gfs2_alloc *al = NULL; + struct gfs2_qadata *qa = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); struct page *page; @@ -639,8 +639,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); if (alloc_required) { - al = gfs2_alloc_get(ip); - if (!al) { + qa = gfs2_qadata_get(ip); + if (!qa) { error = -ENOMEM; goto out_unlock; } @@ -649,8 +649,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (error) goto out_alloc_put; - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (error) goto out_qunlock; } @@ -711,7 +710,7 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } out_unlock: if (&ip->i_inode == sdp->sd_rindex) { @@ -848,7 +847,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -880,10 +879,11 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, brelse(dibh); failed: gfs2_trans_end(sdp); - if (al) { + if (ip->i_res) gfs2_inplace_release(ip); + if (qa) { gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } if (inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 41d494d7970..14a70401597 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -133,7 +133,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) goto out_brelse; if (isdir) { @@ -503,7 +503,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, do { int error; n = blks - alloced; - error = gfs2_alloc_block(ip, &bn, &n); + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) return error; alloced += n; @@ -743,9 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, else if (ip->i_depth) revokes = sdp->sd_inptrs; - if (error) - return error; - memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); bstart = 0; blen = 0; @@ -1044,7 +1041,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; find_metapath(sdp, lblock, &mp, ip->i_height); - if (!gfs2_alloc_get(ip)) + if (!gfs2_qadata_get(ip)) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -1064,7 +1061,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) gfs2_quota_unhold(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -1166,21 +1163,20 @@ static int do_grow(struct inode *inode, u64 size) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; - struct gfs2_alloc *al = NULL; + struct gfs2_qadata *qa = NULL; int error; if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { - al = gfs2_alloc_get(ip); - if (al == NULL) + qa = gfs2_qadata_get(ip); + if (qa == NULL) return -ENOMEM; error = gfs2_quota_lock_check(ip); if (error) goto do_grow_alloc_put; - al->al_requested = 1; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, 1); if (error) goto do_grow_qunlock; } @@ -1189,7 +1185,7 @@ static int do_grow(struct inode *inode, u64 size) if (error) goto do_grow_release; - if (al) { + if (qa) { error = gfs2_unstuff_dinode(ip, NULL); if (error) goto do_end_trans; @@ -1208,12 +1204,12 @@ static int do_grow(struct inode *inode, u64 size) do_end_trans: gfs2_trans_end(sdp); do_grow_release: - if (al) { + if (qa) { gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); do_grow_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } return error; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8ccad2467cb..c35573abd37 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -76,6 +76,8 @@ #define IS_LEAF 1 /* Hashed (leaf) directory */ #define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ + #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) @@ -821,7 +823,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; - error = gfs2_alloc_block(ip, &bn, &n); + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) return NULL; bh = gfs2_meta_new(ip->i_gl, bn); @@ -1376,6 +1378,52 @@ out: return error; } +/** + * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. + */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count; + + /* First check if we've already read-ahead for the whole range. */ + if (index + MAX_RA_BLOCKS < f_ra->start) + return; + + f_ra->start = max((pgoff_t)index, f_ra->start); + for (count = 0; count < MAX_RA_BLOCKS; count++) { + if (f_ra->start >= hsize) /* if exceeded the hash table */ + break; + + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]); + f_ra->start++; + if (blocknr == last) + continue; + + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META, bh); + continue; + } + brelse(bh); + } +} /** * dir_e_read - Reads the entries from a directory into a filldir buffer @@ -1388,7 +1436,7 @@ out: */ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) + filldir_t filldir, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); u32 hsize, len = 0; @@ -1402,10 +1450,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, hash = gfs2_dir_offset2hash(*offset); index = hash >> (32 - dip->i_depth); + if (dip->i_hash_cache == NULL) + f_ra->start = 0; lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) return PTR_ERR(lp); + gfs2_dir_readahead(inode, hsize, index, f_ra); + while (index < hsize) { error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, &copied, &depth, @@ -1423,7 +1475,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, } int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir) + filldir_t filldir, struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1437,7 +1489,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, return 0; if (dip->i_diskflags & GFS2_DIF_EXHASH) - return dir_e_read(inode, offset, opaque, filldir); + return dir_e_read(inode, offset, opaque, filldir, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1798,7 +1850,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (!ht) return -ENOMEM; - if (!gfs2_alloc_get(dip)) { + if (!gfs2_qadata_get(dip)) { error = -ENOMEM; goto out; } @@ -1887,7 +1939,7 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out_put: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); out: kfree(ht); return error; diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index ff5772fbf02..98c960beab3 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir); + filldir_t filldir, struct file_ra_state *f_ra); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index fe9945f2ff7..70ba891654f 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -99,6 +99,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, struct gfs2_holder gh; u64 offset = 0; int error; + struct file_ra_state f_ra = { .start = 0 }; if (!dir) return -EINVAL; @@ -118,7 +119,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, if (error) return error; - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ce36a56dfea..c5fb3597f69 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -105,7 +105,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) return error; } - error = gfs2_dir_read(dir, &offset, dirent, filldir); + error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); @@ -223,7 +223,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) int error; u32 new_flags, flags; - error = mnt_want_write(filp->f_path.mnt); + error = mnt_want_write_file(filp); if (error) return error; @@ -285,7 +285,7 @@ out_trans_end: out: gfs2_glock_dq_uninit(&gh); out_drop_write: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return error; } @@ -365,7 +365,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; - struct gfs2_alloc *al; + struct gfs2_qadata *qa; loff_t size; int ret; @@ -393,16 +393,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) } ret = -ENOMEM; - al = gfs2_alloc_get(ip); - if (al == NULL) + qa = gfs2_qadata_get(ip); + if (qa == NULL) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - ret = gfs2_inplace_reserve(ip); + ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (ret) goto out_quota_unlock; @@ -448,7 +447,7 @@ out_trans_fail: out_quota_unlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&gh); out: @@ -609,7 +608,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); - int ret, ret1 = 0; + int ret = 0, ret1 = 0; if (mapping->nrpages) { ret1 = filemap_fdatawrite_range(mapping, start, end); @@ -750,8 +749,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, struct gfs2_inode *ip = GFS2_I(inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes; - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; + const loff_t pos = offset; + const loff_t count = len; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; @@ -782,8 +783,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, while (len > 0) { if (len < bytes) bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { + qa = gfs2_qadata_get(ip); + if (!qa) { error = -ENOMEM; goto out_unlock; } @@ -795,8 +796,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; @@ -810,7 +810,6 @@ retry: max_bytes = bytes; calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + RES_RG_HDR + gfs2_rg_blocks(ip); @@ -832,8 +831,11 @@ retry: offset += max_bytes; gfs2_inplace_release(ip); gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); } + + if (error == 0) + error = generic_write_sync(file, pos, count); goto out_unlock; out_trans_fail: @@ -841,7 +843,7 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_alloc_put: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); out_unlock: gfs2_glock_dq(&ip->i_gh); out_uninit: diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 88e8a23d002..351a3e79778 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -167,14 +167,19 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { - spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); spin_unlock(&lru_lock); } @@ -217,11 +222,12 @@ void gfs2_glock_put(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (atomic_dec_and_test(&gl->gl_ref)) { + if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); spin_lock_bucket(gl->gl_hash); hlist_bl_del_rcu(&gl->gl_list); spin_unlock_bucket(gl->gl_hash); - gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); @@ -1353,7 +1359,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_lock(&gl->gl_spin); gl->gl_reply = ret; - if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2553b858a72..307ac31df78 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -121,8 +121,11 @@ enum { struct lm_lockops { const char *lm_proto_name; - int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); - void (*lm_unmount) (struct gfs2_sbd *sdp); + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct gfs2_glock *gl); int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7389dfdcc9e..97742a7ea9c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -139,8 +139,45 @@ struct gfs2_bufdata { #define GDLM_STRNAME_BYTES 25 #define GDLM_LVB_SIZE 32 +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + enum { DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, }; struct lm_lockname { @@ -244,17 +281,16 @@ struct gfs2_glock { #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ -struct gfs2_alloc { +struct gfs2_qadata { /* quota allocation data */ /* Quota stuff */ - struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; - struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; - unsigned int al_qd_num; - - u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ - u32 al_alloced; /* Filled in by gfs2_alloc_*() */ + struct gfs2_quota_data *qa_qd[2*MAXQUOTAS]; + struct gfs2_holder qa_qd_ghs[2*MAXQUOTAS]; + unsigned int qa_qd_num; +}; - /* Filled in by gfs2_inplace_reserve() */ - struct gfs2_holder al_rgd_gh; +struct gfs2_blkreserv { + u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */ }; enum { @@ -275,7 +311,8 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_alloc *i_alloc; + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_blkreserv *i_res; /* resource group block reservation */ struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; @@ -392,6 +429,7 @@ struct gfs2_jdesc { #define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; + int jd_recover_error; }; struct gfs2_statfs_change_host { @@ -461,6 +499,7 @@ enum { SDF_NORECOVERY = 4, SDF_DEMOTE = 5, SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ }; #define GFS2_FSNAME_LEN 256 @@ -499,14 +538,26 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_first_done; unsigned int ls_nodir; const struct lm_lockops *ls_ops; - unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid_done; - int ls_recover_jid_status; + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ }; struct gfs2_sbd { @@ -544,6 +595,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct delayed_work sd_control_work; /* Inode Stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index cfd4959b218..56987460cda 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -333,7 +333,7 @@ out: */ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, - unsigned int mode) + umode_t mode) { int error; @@ -364,7 +364,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, +static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, unsigned int *uid, unsigned int *gid) { if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && @@ -389,12 +389,9 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); int error; + int dblocks = 1; - if (gfs2_alloc_get(dip) == NULL) - return -ENOMEM; - - dip->i_alloc->al_requested = RES_DINODE; - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, RES_DINODE); if (error) goto out; @@ -402,14 +399,13 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - error = gfs2_alloc_di(dip, no_addr, generation); + error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); gfs2_trans_end(sdp); out_ipreserv: gfs2_inplace_release(dip); out: - gfs2_alloc_put(dip); return error; } @@ -447,7 +443,7 @@ static void gfs2_init_dir(struct buffer_head *dibh, */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - const struct gfs2_inum_host *inum, unsigned int mode, + const struct gfs2_inum_host *inum, umode_t mode, unsigned int uid, unsigned int gid, const u64 *generation, dev_t dev, const char *symname, unsigned size, struct buffer_head **bhp) @@ -516,7 +512,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, } static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, - unsigned int mode, const struct gfs2_inum_host *inum, + umode_t mode, const struct gfs2_inum_host *inum, const u64 *generation, dev_t dev, const char *symname, unsigned int size, struct buffer_head **bhp) { @@ -525,7 +521,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, int error; munge_mode_uid_gid(dip, &mode, &uid, &gid); - if (!gfs2_alloc_get(dip)) + if (!gfs2_qadata_get(dip)) return -ENOMEM; error = gfs2_quota_lock(dip, uid, gid); @@ -547,7 +543,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, out_quota: gfs2_quota_unlock(dip); out: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); return error; } @@ -555,13 +551,13 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int alloc_required; struct buffer_head *dibh; int error; - al = gfs2_alloc_get(dip); - if (!al) + qa = gfs2_qadata_get(dip); + if (!qa) return -ENOMEM; error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -576,9 +572,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); if (error) goto fail_quota_locks; @@ -601,9 +595,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto fail_end_trans; - inc_nlink(&ip->i_inode); - if (S_ISDIR(ip->i_inode.i_mode)) - inc_nlink(&ip->i_inode); + set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -619,11 +611,11 @@ fail_quota_locks: gfs2_quota_unlock(dip); fail: - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); return error; } -int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, +static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; @@ -659,7 +651,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, */ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, - unsigned int mode, dev_t dev, const char *symname, + umode_t mode, dev_t dev, const char *symname, unsigned int size, int excl) { const struct qstr *name = &dentry->d_name; @@ -728,9 +720,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, brelse(bh); gfs2_trans_end(sdp); - gfs2_inplace_release(dip); + /* Check if we reserved space in the rgrp. Function link_dinode may + not, depending on whether alloc is required. */ + if (dip->i_res) + gfs2_inplace_release(dip); gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); mark_inode_dirty(inode); gfs2_glock_dq_uninit_m(2, ghs); d_instantiate(dentry, inode); @@ -760,7 +755,7 @@ fail: */ static int gfs2_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) + umode_t mode, struct nameidata *nd) { int excl = 0; if (nd && (nd->flags & LOOKUP_EXCL)) @@ -875,8 +870,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, error = 0; if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(dip); - if (!al) { + struct gfs2_qadata *qa = gfs2_qadata_get(dip); + + if (!qa) { error = -ENOMEM; goto out_gunlock; } @@ -885,9 +881,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_alloc; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); + error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); if (error) goto out_gunlock_q; @@ -930,7 +924,7 @@ out_gunlock_q: gfs2_quota_unlock(dip); out_alloc: if (alloc_required) - gfs2_alloc_put(dip); + gfs2_qadata_put(dip); out_gunlock: gfs2_glock_dq(ghs + 1); out_child: @@ -1037,12 +1031,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - int error; + int error = -EROFS; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + if (!rgd) + goto out_inodes; + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); @@ -1088,12 +1085,13 @@ out_end_trans: out_gunlock: gfs2_glock_dq(ghs + 2); out_rgrp: - gfs2_holder_uninit(ghs + 2); gfs2_glock_dq(ghs + 1); out_child: - gfs2_holder_uninit(ghs + 1); gfs2_glock_dq(ghs); out_parent: + gfs2_holder_uninit(ghs + 2); +out_inodes: + gfs2_holder_uninit(ghs + 1); gfs2_holder_uninit(ghs); return error; } @@ -1129,7 +1127,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, * Returns: errno */ -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); } @@ -1143,7 +1141,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) * */ -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); @@ -1350,8 +1348,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = 0; if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(ndip); - if (!al) { + struct gfs2_qadata *qa = gfs2_qadata_get(ndip); + + if (!qa) { error = -ENOMEM; goto out_gunlock; } @@ -1360,9 +1359,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_alloc; - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(ndip); + error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); if (error) goto out_gunlock_q; @@ -1423,7 +1420,7 @@ out_gunlock_q: gfs2_quota_unlock(ndip); out_alloc: if (alloc_required) - gfs2_alloc_put(ndip); + gfs2_qadata_put(ndip); out_gunlock: while (x--) { gfs2_glock_dq(ghs + x); @@ -1584,7 +1581,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) ogid = ngid = NO_QUOTA_CHANGE; - if (!gfs2_alloc_get(ip)) + if (!gfs2_qadata_get(ip)) return -ENOMEM; error = gfs2_quota_lock(ip, nuid, ngid); @@ -1616,7 +1613,7 @@ out_end_trans: out_gunlock_q: gfs2_quota_unlock(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 98c80d8c2a6..8944d1e32ab 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1,6 +1,6 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -11,12 +11,15 @@ #include <linux/dlm.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/delay.h> #include <linux/gfs2_ondisk.h> #include "incore.h" #include "glock.h" #include "util.h" +#include "sys.h" +extern struct workqueue_struct *gfs2_control_wq; static void gdlm_ast(void *arg) { @@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl) dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); } -static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. (Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. + */ + +#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */ + +static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); + memcpy(&gen, lvb_bits, sizeof(uint32_t)); + *lvb_gen = le32_to_cpu(gen); +} + +static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, + char *lvb_bits) +{ + uint32_t gen; + memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); + gen = cpu_to_le32(lvb_gen); + memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); +} + +static int all_jid_bits_clear(char *lvb) +{ + int i; + for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { + if (lvb[i]) + return 0; + } + return 1; +} + +static void sync_wait_cb(void *arg) +{ + struct lm_lockstruct *ls = arg; + complete(&ls->ls_sync_wait); +} + +static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - if (fsname == NULL) { - fs_info(sdp, "no fsname found\n"); - return -EINVAL; + error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls); + if (error) { + fs_err(sdp, "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + fs_err(sdp, "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags, + unsigned int num, struct dlm_lksb *lksb, char *name) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char strname[GDLM_STRNAME_BYTES]; + int error, status; + + memset(strname, 0, GDLM_STRNAME_BYTES); + snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num); + + error = dlm_lock(ls->ls_dlm, mode, lksb, flags, + strname, GDLM_STRNAME_BYTES - 1, + 0, sync_wait_cb, ls, NULL); + if (error) { + fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&ls->ls_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + +static int mounted_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK, + &ls->ls_mounted_lksb, "mounted_lock"); +} + +static int control_unlock(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock"); +} + +static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK, + &ls->ls_control_lksb, "control_lock"); +} + +static void gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < lvb_gen) + ls->ls_recover_submit[i] = 0; + } + } else if (lvb_gen < start_gen) { + /* + * Failed slots before start_gen are not yet set in lvb. + */ + for (i = 0; i < recover_size; i++) { + if (!ls->ls_recover_submit[i]) + continue; + if (ls->ls_recover_submit[i] < start_gen) { + ls->ls_recover_submit[i] = 0; + __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + } + } + /* even if there are no bits to set, we need to write the + latest generation to the lvb */ + write_lvb = 1; + } else { + /* + * we should be getting a recover_done() for lvb_gen soon + */ + } + spin_unlock(&ls->ls_recover_spin); + + if (write_lvb) { + control_lvb_write(ls, start_gen, lvb_bits); + flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; + } else { + flags = DLM_LKF_CONVERT; + } + + error = control_lock(sdp, DLM_LOCK_NL, flags); + if (error) { + fs_err(sdp, "control lock NL error %d\n", error); + return; + } + + /* + * Everyone will see jid bits set in the lvb, run gfs2_recover_set(), + * and clear a jid bit in the lvb if the recovery is a success. + * Eventually all journals will be recovered, all jid bits will + * be cleared in the lvb, and everyone will clear BLOCK_LOCKS. + */ + + for (i = 0; i < recover_size; i++) { + if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + fs_info(sdp, "recover generation %u jid %d\n", + start_gen, i); + gfs2_recover_set(sdp, i); + recover_set++; + } + } + if (recover_set) + return; + + /* + * No more jid bits set in lvb, all recovery is done, unblock locks + * (unless a new recover_prep callback has occured blocking locks + * again while working above) + */ + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_block == block_gen && + ls->ls_recover_start == start_gen) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "recover generation %u done\n", start_gen); + gfs2_glock_thaw(sdp); + } else { + fs_info(sdp, "recover generation %u block2 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + } +} + +static int control_mount(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen, mount_gen, lvb_gen; + int mounted_mode; + int retries = 0; + int error; + + memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb)); + memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE); + ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb; + init_completion(&ls->ls_sync_wait); + + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control_mount control_lock NL error %d\n", error); + return error; + } + + error = mounted_lock(sdp, DLM_LOCK_NL, 0); + if (error) { + fs_err(sdp, "control_mount mounted_lock NL error %d\n", error); + control_unlock(sdp); + return error; + } + mounted_mode = DLM_LOCK_NL; + +restart: + if (retries++ && signal_pending(current)) { + error = -EINTR; + goto fail; + } + + /* + * We always start with both locks in NL. control_lock is + * demoted to NL below so we don't need to do it here. + */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) + */ + + if (!all_jid_bits_clear(lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + fs_info(sdp, "control_mount wait1 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; } - error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL | - (ls->ls_nodir ? DLM_LSFL_NODIR : 0), - GDLM_LVB_SIZE); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int dlm_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char lvb_bits[GDLM_LVB_SIZE]; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. + */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(lvb_bits, 0, sizeof(lvb_bits)); + control_lvb_write(ls, start_gen, lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); if (error) - printk(KERN_ERR "dlm_new_lockspace error %d", error); + fs_err(sdp, "control_first_done control NL error %d\n", error); return error; } +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accomodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + + if (old_size >= max_jid + 1) + return 0; + + new_size = old_size + RECOVER_SIZE_INC; + + submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + if (ls->ls_nodir) + flags |= DLM_LSFL_NODIR; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. + */ + + error = control_mount(sdp); + if (error) { + fs_err(sdp, "mount control error %d\n", error); + goto fail_release; + } + + ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + return 0; + +fail_release: + dlm_release_lockspace(ls->ls_dlm, 2); +fail_free: + free_recover_size(ls); +fail: + return error; +} + +static void gdlm_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + error = control_first_done(sdp); + if (error) + fs_err(sdp, "mount first_done error %d\n", error); +} + static void gdlm_unmount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + goto release; + + /* wait for gfs2_control_wq to be done with this mount */ + + spin_lock(&ls->ls_recover_spin); + set_bit(DFL_UNMOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + flush_delayed_work_sync(&sdp->sd_control_work); + + /* mounted_lock and control_lock will be purged in dlm recovery */ +release: if (ls->ls_dlm) { dlm_release_lockspace(ls->ls_dlm, 2); ls->ls_dlm = NULL; } + + free_recover_size(ls); } static const match_table_t dlm_tokens = { @@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = { const struct lm_lockops gfs2_dlm_ops = { .lm_proto_name = "lock_dlm", .lm_mount = gdlm_mount, + .lm_first_done = gdlm_first_done, + .lm_recovery_result = gdlm_recovery_result, .lm_unmount = gdlm_unmount, .lm_put_lock = gdlm_put_lock, .lm_lock = gdlm_lock, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 59864643436..756fae9eaf8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -626,7 +626,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); else - submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh); + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) @@ -951,8 +951,8 @@ int gfs2_logd(void *data) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; - if (freezing(current)) - refrigerator(); + + try_to_freeze(); do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 8a139ff1919..a8d9bcd0e19 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -28,6 +28,8 @@ #include "recovery.h" #include "dir.h" +struct workqueue_struct *gfs2_control_wq; + static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, .seeks = DEFAULT_SEEKS, @@ -40,7 +42,8 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); - ip->i_alloc = NULL; + ip->i_qadata = NULL; + ip->i_res = NULL; ip->i_hash_cache = NULL; } @@ -145,12 +148,19 @@ static int __init init_gfs2_fs(void) if (!gfs_recovery_wq) goto fail_wq; + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_control; + gfs2_register_debugfs(); printk("GFS2 installed\n"); return 0; +fail_control: + destroy_workqueue(gfs_recovery_wq); fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: @@ -194,6 +204,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_control_wq); rcu_barrier(); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index be29858900f..181586e673f 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; @@ -444,7 +444,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(READA, 1, &bh); + ll_rw_block(READA | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index cb23c2be731..24f609c9ef9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio); + submit_bio(READ_SYNC | REQ_META, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { @@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; char *envp[] = { message, NULL }; - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_first_done = 1; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } @@ -796,6 +800,11 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get quota file inode: %d\n", error); goto fail_rindex; } + + error = gfs2_rindex_update(sdp); + if (error) + goto fail_qinode; + return 0; fail_qinode: @@ -944,7 +953,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) struct gfs2_args *args = &sdp->sd_args; const char *proto = sdp->sd_proto_name; const char *table = sdp->sd_table_name; - const char *fsname; char *o, *options; int ret; @@ -1004,21 +1012,12 @@ hostdata_error: } } - if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); - else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, - sdp->sd_lockstruct.ls_jid); - - fsname = strchr(table, ':'); - if (fsname) - fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); return 0; } - ret = lm->lm_mount(sdp, fsname); + ret = lm->lm_mount(sdp, table); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); complete_all(&sdp->sd_locking_init); @@ -1084,7 +1083,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; - set_bit(SDF_NORECOVERY, &sdp->sd_flags); + set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; @@ -1124,6 +1123,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail; + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + gfs2_create_debugfs_file(sdp); error = gfs2_sys_fs_add(sdp); @@ -1160,6 +1161,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_sb; } + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7e528dc14f8..a45b21b0391 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -494,11 +494,11 @@ static void qdsb_put(struct gfs2_quota_data *qd) int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_quota_data **qd = al->al_qd; + struct gfs2_qadata *qa = ip->i_qadata; + struct gfs2_quota_data **qd = qa->qa_qd; int error; - if (gfs2_assert_warn(sdp, !al->al_qd_num) || + if (gfs2_assert_warn(sdp, !qa->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; @@ -508,20 +508,20 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) { error = qdsb_get(sdp, QUOTA_USER, uid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; } @@ -529,7 +529,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) error = qdsb_get(sdp, QUOTA_GROUP, gid, qd); if (error) goto out; - al->al_qd_num++; + qa->qa_qd_num++; qd++; } @@ -542,16 +542,16 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; unsigned int x; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < al->al_qd_num; x++) { - qdsb_put(al->al_qd[x]); - al->al_qd[x] = NULL; + for (x = 0; x < qa->qa_qd_num; x++) { + qdsb_put(qa->qa_qd[x]); + qa->qa_qd[x] = NULL; } - al->al_qd_num = 0; + qa->qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -712,7 +712,7 @@ get_a_page: set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(READ | REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -762,7 +762,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) struct gfs2_quota_data *qd; loff_t offset; unsigned int nalloc = 0, blocks; - struct gfs2_alloc *al = NULL; int error; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), @@ -792,26 +791,19 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) nalloc++; } - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_gunlock; - } /* * 1 blk for unstuffing inode if stuffed. We add this extra * block to the reservation unconditionally. If the inode * doesn't need unstuffing, the block will be released to the * rgrp since it won't be allocated during the transaction */ - al->al_requested = 1; /* +3 in the end for unstuffing block, inode size update block * and another block in case quota straddles page boundary and * two blocks need to be updated instead of 1 */ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; - if (nalloc) - al->al_requested += nalloc * (data_blocks + ind_blocks); - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, 1 + + (nalloc * (data_blocks + ind_blocks))); if (error) goto out_alloc; @@ -840,8 +832,6 @@ out_end_trans: out_ipres: gfs2_inplace_release(ip); out_alloc: - gfs2_alloc_put(ip); -out_gunlock: gfs2_glock_dq_uninit(&i_gh); out: while (qx--) @@ -925,7 +915,7 @@ fail: int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; int error = 0; @@ -938,15 +928,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *), + sort(qa->qa_qd, qa->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < al->al_qd_num; x++) { + for (x = 0; x < qa->qa_qd_num; x++) { int force = NO_FORCE; - qd = al->al_qd[x]; + qd = qa->qa_qd[x]; if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) force = FORCE; - error = do_glock(qd, force, &al->al_qd_ghs[x]); + error = do_glock(qd, force, &qa->qa_qd_ghs[x]); if (error) break; } @@ -955,7 +945,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -1000,7 +990,7 @@ static int need_sync(struct gfs2_quota_data *qd) void gfs2_quota_unlock(struct gfs2_inode *ip) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qda[4]; unsigned int count = 0; unsigned int x; @@ -1008,14 +998,14 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < al->al_qd_num; x++) { + for (x = 0; x < qa->qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = al->al_qd[x]; + qd = qa->qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_glock_dq_uninit(&qa->qa_qd_ghs[x]); if (sync && qd_trylock(qd)) qda[count++] = qd; @@ -1048,7 +1038,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type) int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; s64 value; unsigned int x; @@ -1060,8 +1050,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < qa->qa_qd_num; x++) { + qd = qa->qa_qd[x]; if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) @@ -1099,7 +1089,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_qadata *qa = ip->i_qadata; struct gfs2_quota_data *qd; unsigned int x; @@ -1108,8 +1098,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < al->al_qd_num; x++) { - qd = al->al_qd[x]; + for (x = 0; x < qa->qa_qd_num; x++) { + qd = qa->qa_qd[x]; if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { @@ -1427,8 +1417,8 @@ int gfs2_quotad(void *data) /* Check for & recover partially truncated inodes */ quotad_check_trunc_list(sdp); - if (freezing(current)) - refrigerator(); + try_to_freeze(); + t = min(quotad_timeo, statfs_timeo); prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); @@ -1529,7 +1519,6 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, unsigned int data_blocks, ind_blocks; unsigned int blocks = 0; int alloc_required; - struct gfs2_alloc *al; loff_t offset; int error; @@ -1594,15 +1583,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { - al = gfs2_alloc_get(ip); - if (al == NULL) - goto out_i; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - blocks = al->al_requested = 1 + data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); + blocks = 1 + data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip, blocks); if (error) - goto out_alloc; + goto out_i; blocks += gfs2_rg_blocks(ip); } @@ -1617,11 +1603,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, gfs2_trans_end(sdp); out_release: - if (alloc_required) { + if (alloc_required) gfs2_inplace_release(ip); -out_alloc: - gfs2_alloc_put(ip); - } out_i: gfs2_glock_dq_uninit(&i_gh); out_q: diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f2a02edcac8..963b2d75200 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%d", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); + + if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) + sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } void gfs2_recover_func(struct work_struct *work) @@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { @@ -577,6 +583,7 @@ fail_gunlock_j: fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: + jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); @@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); - return 0; + return wait ? jd->jd_recover_error : 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 96bd6d759f2..49ada95209d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -65,8 +65,8 @@ static const char valid_change[16] = { }; static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state, - unsigned int *n); + unsigned char old_state, + struct gfs2_bitmap **rbi); /** * gfs2_setbit - Set a bit in the bitmaps @@ -683,16 +683,21 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp) struct gfs2_glock *gl = ip->i_gl; struct gfs2_holder ri_gh; int error = 0; + int unlock_required = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); - if (error) - return error; + if (!gfs2_glock_is_locked_by_me(gl)) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + unlock_required = 1; + } if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - gfs2_glock_dq_uninit(&ri_gh); + if (unlock_required) + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } @@ -860,22 +865,36 @@ fail: } /** - * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode + * gfs2_qadata_get - get the struct gfs2_qadata structure for an inode * @ip: the incore GFS2 inode structure * - * Returns: the struct gfs2_alloc + * Returns: the struct gfs2_qadata */ -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int error; - BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); + BUG_ON(ip->i_qadata != NULL); + ip->i_qadata = kzalloc(sizeof(struct gfs2_qadata), GFP_NOFS); error = gfs2_rindex_update(sdp); if (error) fs_warn(sdp, "rindex update returns %d\n", error); - return ip->i_alloc; + return ip->i_qadata; +} + +/** + * gfs2_blkrsv_get - get the struct gfs2_blkreserv structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_qadata + */ + +static struct gfs2_blkreserv *gfs2_blkrsv_get(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_res != NULL); + ip->i_res = kzalloc(sizeof(struct gfs2_blkreserv), GFP_NOFS); + return ip->i_res; } /** @@ -890,15 +909,20 @@ struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) { - const struct gfs2_alloc *al = ip->i_alloc; + const struct gfs2_blkreserv *rs = ip->i_res; if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; - if (rgd->rd_free_clone >= al->al_requested) + if (rgd->rd_free_clone >= rs->rs_requested) return 1; return 0; } +static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk) +{ + return (bi->bi_start * GFS2_NBBY) + blk; +} + /** * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes * @rgd: The rgrp @@ -912,20 +936,20 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; - unsigned int n; struct gfs2_glock *gl; struct gfs2_inode *ip; int error; int found = 0; + struct gfs2_bitmap *bi; while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); - n = 1; - block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, - GFS2_BLKST_UNLINKED, &n); + block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, &bi); up_write(&sdp->sd_log_flush_lock); if (block == BFITNOENT) break; + + block = gfs2_bi2rgd_blk(bi, block); /* rgblk_search can return a block < goal, so we need to keep it marching forward. */ no_addr = block + rgd->rd_data0; @@ -977,8 +1001,8 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; - struct gfs2_alloc *al = ip->i_alloc; - int error, rg_locked; + struct gfs2_blkreserv *rs = ip->i_res; + int error, rg_locked, flags = LM_FLAG_TRY; int loops = 0; if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) @@ -997,7 +1021,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) error = 0; } else { error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_TRY, &al->al_rgd_gh); + flags, &rs->rs_rgd_gh); } switch (error) { case 0: @@ -1008,12 +1032,14 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); /* fall through */ case GLR_TRYFAILED: rgd = gfs2_rgrpd_get_next(rgd); - if (rgd == begin) + if (rgd == begin) { + flags = 0; loops++; + } break; default: return error; @@ -1023,6 +1049,13 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) return -ENOSPC; } +static void gfs2_blkrsv_put(struct gfs2_inode *ip) +{ + BUG_ON(ip->i_res == NULL); + kfree(ip->i_res); + ip->i_res = NULL; +} + /** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for @@ -1030,16 +1063,23 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip) +int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_blkreserv *rs; int error = 0; u64 last_unlinked = NO_BLOCK; int tries = 0; - if (gfs2_assert_warn(sdp, al->al_requested)) - return -EINVAL; + rs = gfs2_blkrsv_get(ip); + if (!rs) + return -ENOMEM; + + rs->rs_requested = requested; + if (gfs2_assert_warn(sdp, requested)) { + error = -EINVAL; + goto out; + } do { error = get_local_rgrp(ip, &last_unlinked); @@ -1056,6 +1096,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip) gfs2_log_flush(sdp, NULL); } while (tries++ < 3); +out: + if (error) + gfs2_blkrsv_put(ip); return error; } @@ -1068,10 +1111,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip) void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_alloc *al = ip->i_alloc; + struct gfs2_blkreserv *rs = ip->i_res; - if (al->al_rgd_gh.gh_gl) - gfs2_glock_dq_uninit(&al->al_rgd_gh); + if (rs->rs_rgd_gh.gh_gl) + gfs2_glock_dq_uninit(&rs->rs_rgd_gh); + gfs2_blkrsv_put(ip); } /** @@ -1108,39 +1152,35 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) } /** - * rgblk_search - find a block in @old_state, change allocation - * state to @new_state + * rgblk_search - find a block in @state * @rgd: the resource group descriptor * @goal: the goal block within the RG (start here to search for avail block) - * @old_state: GFS2_BLKST_XXX the before-allocation state to find - * @new_state: GFS2_BLKST_XXX the after-allocation block state - * @n: The extent length + * @state: GFS2_BLKST_XXX the before-allocation state to find + * @dinode: TRUE if the first block we allocate is for a dinode + * @rbi: address of the pointer to the bitmap containing the block found * - * Walk rgrp's bitmap to find bits that represent a block in @old_state. - * Add the found bitmap buffer to the transaction. - * Set the found bits to @new_state to change block's allocation state. + * Walk rgrp's bitmap to find bits that represent a block in @state. * * This function never fails, because we wouldn't call it unless we * know (from reservation results, etc.) that a block is available. * - * Scope of @goal and returned block is just within rgrp, not the whole - * filesystem. + * Scope of @goal is just within rgrp, not the whole filesystem. + * Scope of @returned block is just within bitmap, not the whole filesystem. * - * Returns: the block number allocated + * Returns: the block number found relative to the bitmap rbi */ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char old_state, unsigned char new_state, - unsigned int *n) + unsigned char state, + struct gfs2_bitmap **rbi) { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; u32 blk = BFITNOENT; unsigned int buf, x; - const unsigned int elen = *n; const u8 *buffer = NULL; - *n = 0; + *rbi = NULL; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; @@ -1163,21 +1203,21 @@ do_search: bi = rgd->rd_bits + buf; if (test_bit(GBF_FULL, &bi->bi_flags) && - (old_state == GFS2_BLKST_FREE)) + (state == GFS2_BLKST_FREE)) goto skip; /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; WARN_ON(!buffer_uptodate(bi->bi_bh)); - if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) + if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; - blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state); + blk = gfs2_bitfit(buffer, bi->bi_len, goal, state); if (blk != BFITNOENT) break; - if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + if ((goal == 0) && (state == GFS2_BLKST_FREE)) set_bit(GBF_FULL, &bi->bi_flags); /* Try next bitmap block (wrap back to rgrp header if at end) */ @@ -1187,16 +1227,37 @@ skip: goal = 0; } - if (blk == BFITNOENT) - return blk; + if (blk != BFITNOENT) + *rbi = bi; - *n = 1; - if (old_state == new_state) - goto out; + return blk; +} + +/** + * gfs2_alloc_extent - allocate an extent from a given bitmap + * @rgd: the resource group descriptor + * @bi: the bitmap within the rgrp + * @blk: the block within the bitmap + * @dinode: TRUE if the first block we allocate is for a dinode + * @n: The extent length + * + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * Returns: starting block number of the extent (fs scope) + */ +static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, + u32 blk, bool dinode, unsigned int *n) +{ + const unsigned int elen = *n; + u32 goal; + const u8 *buffer = NULL; + *n = 0; + buffer = bi->bi_bh->b_data + bi->bi_offset; gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, blk, new_state); + bi, blk, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + (*n)++; goal = blk; while (*n < elen) { goal++; @@ -1206,11 +1267,12 @@ skip: GFS2_BLKST_FREE) break; gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, goal, new_state); + bi, goal, GFS2_BLKST_USED); (*n)++; } -out: - return (bi->bi_start * GFS2_NBBY) + blk; + blk = gfs2_bi2rgd_blk(bi, blk); + rgd->rd_last_alloc = blk + *n - 1; + return rgd->rd_data0 + blk; } /** @@ -1298,121 +1360,93 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) } /** - * gfs2_alloc_block - Allocate one or more blocks + * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode * @ip: the inode to allocate the block for * @bn: Used to return the starting block number - * @n: requested number of blocks/extent length (value/result) + * @ndata: requested number of blocks/extent length (value/result) + * @dinode: 1 if we're allocating a dinode block, else 0 + * @generation: the generation number of the inode * * Returns: 0 or error */ -int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, + bool dinode, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; - u32 goal, blk; - u64 block; + unsigned int ndata; + u32 goal, blk; /* block, within the rgrp scope */ + u64 block; /* block, within the file system scope */ int error; + struct gfs2_bitmap *bi; /* Only happens if there is a bug in gfs2, return something distinctive * to ensure that it is noticed. */ - if (al == NULL) + if (ip->i_res == NULL) return -ECANCELED; rgd = ip->i_rgd; - if (rgrp_contains_block(rgd, ip->i_goal)) + if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; else goal = rgd->rd_last_alloc; - blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); /* Since all blocks are reserved in advance, this shouldn't happen */ if (blk == BFITNOENT) goto rgrp_error; - rgd->rd_last_alloc = blk; - block = rgd->rd_data0 + blk; - ip->i_goal = block + *n - 1; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error == 0) { - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); - brelse(dibh); + block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); + ndata = *nblocks; + if (dinode) + ndata--; + + if (!dinode) { + ip->i_goal = block + ndata - 1; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = + (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = + cpu_to_be64(ip->i_goal); + brelse(dibh); + } } - if (rgd->rd_free < *n) + if (rgd->rd_free < *nblocks) goto rgrp_error; - rgd->rd_free -= *n; - - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); - gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - - al->al_alloced += *n; - - gfs2_statfs_change(sdp, 0, -(s64)*n, 0); - gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); - - rgd->rd_free_clone -= *n; - trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); - *bn = block; - return 0; - -rgrp_error: - gfs2_rgrp_error(rgd); - return -EIO; -} - -/** - * gfs2_alloc_di - Allocate a dinode - * @dip: the directory that the inode is going in - * @bn: the block number which is allocated - * @generation: the generation number of the inode - * - * Returns: 0 on success or error - */ - -int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc *al = dip->i_alloc; - struct gfs2_rgrpd *rgd = dip->i_rgd; - u32 blk; - u64 block; - unsigned int n = 1; - - blk = rgblk_search(rgd, rgd->rd_last_alloc, - GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); - - /* Since all blocks are reserved in advance, this shouldn't happen */ - if (blk == BFITNOENT) - goto rgrp_error; - - rgd->rd_last_alloc = blk; - block = rgd->rd_data0 + blk; - if (rgd->rd_free == 0) - goto rgrp_error; - - rgd->rd_free--; - rgd->rd_dinodes++; - *generation = rgd->rd_igeneration++; - if (*generation == 0) + rgd->rd_free -= *nblocks; + if (dinode) { + rgd->rd_dinodes++; *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; + } + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - al->al_alloced++; + gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); + if (dinode) + gfs2_trans_add_unrevoke(sdp, block, 1); - gfs2_statfs_change(sdp, 0, -1, +1); - gfs2_trans_add_unrevoke(sdp, block, 1); + /* + * This needs reviewing to see why we cannot do the quota change + * at this point in the dinode case. + */ + if (ndata) + gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, + ip->i_inode.i_gid); - rgd->rd_free_clone--; - trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); + rgd->rd_free_clone -= *nblocks; + trace_gfs2_block_alloc(ip, block, *nblocks, + dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index cf5c5018019..ceec9106cdf 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -28,19 +28,19 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); -extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); -static inline void gfs2_alloc_put(struct gfs2_inode *ip) +extern struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip); +static inline void gfs2_qadata_put(struct gfs2_inode *ip) { - BUG_ON(ip->i_alloc == NULL); - kfree(ip->i_alloc); - ip->i_alloc = NULL; + BUG_ON(ip->i_qadata == NULL); + kfree(ip->i_qadata); + ip->i_qadata = NULL; } -extern int gfs2_inplace_reserve(struct gfs2_inode *ip); +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); +extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode, u64 *generation); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 71e420989f7..4553ce515f6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1284,18 +1284,18 @@ static int is_ancestor(const struct dentry *d1, const struct dentry *d2) /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure - * @mnt: vfsmount + * @root: root of this (sub)tree * * Returns: 0 on success or error code */ -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +static int gfs2_show_options(struct seq_file *s, struct dentry *root) { - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_sbd *sdp = root->d_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; int val; - if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + if (is_ancestor(root, sdp->sd_master_dir)) seq_printf(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); @@ -1399,8 +1399,9 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) static int gfs2_dinode_dealloc(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; + struct gfs2_qadata *qa; struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; int error; if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { @@ -1408,8 +1409,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) return -EIO; } - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -1423,8 +1424,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) goto out_qs; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) goto out_qs; @@ -1440,11 +1440,11 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) gfs2_trans_end(sdp); out_rg_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&gh); out_qs: gfs2_quota_unhold(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -1582,7 +1582,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) static void gfs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(gfs2_inode_cachep, inode); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 443cabcfcd2..d33172c291b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) ssize_t ret; int val = 0; - if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags)) + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) val = 1; ret = sprintf(buf, "%d\n", val); return ret; @@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) val = simple_strtol(buf, NULL, 0); if (val == 1) - set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { - clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); smp_mb__after_clear_bit(); gfs2_glock_thaw(sdp); } else { @@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) goto out; if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) goto out; - sdp->sd_lockstruct.ls_first = first; - rv = 0; + sdp->sd_lockstruct.ls_first = first; + rv = 0; out: spin_unlock(&sdp->sd_jindex_spin); return rv ? rv : len; @@ -360,19 +360,14 @@ out: static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_first_done); + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); } -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) { - unsigned jid; struct gfs2_jdesc *jd; int rv; - rv = sscanf(buf, "%u", &jid); - if (rv != 1) - return -EINVAL; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) @@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) } out: spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = gfs2_recover_set(sdp, jid); + return rv ? rv : len; } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index e94560e836d..79182d6ad6a 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp); int gfs2_sys_init(void); void gfs2_sys_uninit(void); +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + #endif /* __SYS_DOT_H__ */ diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index f8f101ef600..125d4572e1c 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -30,9 +30,9 @@ struct gfs2_glock; * block, or all of the blocks in the rg, whichever is smaller */ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) { - const struct gfs2_alloc *al = ip->i_alloc; - if (al->al_requested < ip->i_rgd->rd_length) - return al->al_requested + 1; + const struct gfs2_blkreserv *rs = ip->i_res; + if (rs->rs_requested < ip->i_rgd->rd_length) + return rs->rs_requested + 1; return ip->i_rgd->rd_length; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 71d7bf830c0..e9636591b5d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -321,11 +321,11 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, int leave) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -336,7 +336,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, gfs2_quota_unhold(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -549,9 +549,10 @@ int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata) goto out; error = gfs2_ea_get_copy(ip, &el, data, len); - if (error == 0) - error = len; - *ppdata = data; + if (error < 0) + kfree(data); + else + *ppdata = data; out: brelse(el.el_bh); return error; @@ -609,7 +610,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) u64 block; int error; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, block, 1); @@ -671,7 +672,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - error = gfs2_alloc_block(ip, &block, &n); + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, block, 1); @@ -708,21 +709,19 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, unsigned int blks, ea_skeleton_call_t skeleton_call, void *private) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; struct buffer_head *dibh; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_lock_check(ip); if (error) goto out; - al->al_requested = blks; - - error = gfs2_inplace_reserve(ip); + error = gfs2_inplace_reserve(ip, blks); if (error) goto out_gunlock_q; @@ -751,7 +750,7 @@ out_ipres: out_gunlock_q: gfs2_quota_unlock(ip); out: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } @@ -991,7 +990,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - error = gfs2_alloc_block(ip, &blk, &n); + error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); if (error) return error; gfs2_trans_add_unrevoke(sdp, blk, 1); @@ -1435,9 +1434,9 @@ out: static int ea_dealloc_block(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd; struct buffer_head *dibh; + struct gfs2_holder gh; int error; rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); @@ -1446,8 +1445,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) return error; @@ -1471,7 +1469,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&gh); return error; } @@ -1484,11 +1482,11 @@ out_gunlock: int gfs2_ea_dealloc(struct gfs2_inode *ip) { - struct gfs2_alloc *al; + struct gfs2_qadata *qa; int error; - al = gfs2_alloc_get(ip); - if (!al) + qa = gfs2_qadata_get(ip); + if (!qa) return -ENOMEM; error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); @@ -1510,7 +1508,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) out_quota: gfs2_quota_unhold(ip); out_alloc: - gfs2_alloc_put(ip); + gfs2_qadata_put(ip); return error; } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index bce4eef91a0..62fc14ea4b7 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -186,7 +186,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -216,7 +216,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index ad97c2d5828..1bf967c6bfd 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -184,7 +184,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); +extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a1a9fdcd2a0..737dbeb6432 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -169,7 +169,7 @@ const struct address_space_operations hfs_aops = { /* * hfs_new_inode */ -struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) +struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 1b55f704fb2..8137fb3e678 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -133,9 +133,9 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) return 0; } -static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int hfs_show_options(struct seq_file *seq, struct dentry *root) { - struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb); + struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); @@ -170,7 +170,6 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) static void hfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4536cd3f15a..88e155f895c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -424,7 +424,7 @@ out: } static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -453,13 +453,13 @@ out: return res; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, +static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return hfsplus_mknod(dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d7674d051f5..21a5b7fc6db 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -402,7 +402,7 @@ void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); int hfsplus_cat_write_inode(struct inode *); -struct inode *hfsplus_new_inode(struct super_block *, int); +struct inode *hfsplus_new_inode(struct super_block *, umode_t); void hfsplus_delete_inode(struct inode *); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -419,7 +419,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_parse_options(char *, struct hfsplus_sb_info *); int hfsplus_parse_options_remount(char *input, int *force); void hfsplus_fill_defaults(struct hfsplus_sb_info *); -int hfsplus_show_options(struct seq_file *, struct vfsmount *); +int hfsplus_show_options(struct seq_file *, struct dentry *); /* super.c */ struct inode *hfsplus_iget(struct super_block *, unsigned long); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 40e1413be4c..6643b242bdd 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -378,7 +378,7 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, int mode) +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index fbaa6690c8e..f66c7655b3f 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -43,7 +43,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) unsigned int flags; int err = 0; - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out; @@ -94,7 +94,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) out_unlock_inode: mutex_unlock(&inode->i_mutex); out_drop_write: - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out: return err; } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bb62a588214..06fa5618600 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -206,9 +206,9 @@ done: return 1; } -int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) +int hfsplus_show_options(struct seq_file *seq, struct dentry *root) { - struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); + struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index d24a9b666a2..427682ca9e4 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); - hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str, - sbi->hidden_dir); + if (!sbi->hidden_dir) { + mutex_unlock(&sbi->vh_mutex); + err = -ENOMEM; + goto out_put_root; + } + err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root, + &str, sbi->hidden_dir); mutex_unlock(&sbi->vh_mutex); + if (err) + goto out_put_hidden_dir; hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY); @@ -558,7 +565,6 @@ static void hfsplus_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index bf15a43016b..3cbfa93cd78 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -39,7 +39,7 @@ struct hostfs_iattr { unsigned int ia_valid; - mode_t ia_mode; + unsigned short ia_mode; uid_t ia_uid; gid_t ia_gid; loff_t ia_size; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2f72da5ae68..e130bd46d67 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -250,7 +250,6 @@ static void hostfs_evict_inode(struct inode *inode) static void hostfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kfree(HOSTFS_I(inode)); } @@ -259,9 +258,9 @@ static void hostfs_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, hostfs_i_callback); } -static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int hostfs_show_options(struct seq_file *seq, struct dentry *root) { - const char *root_path = vfs->mnt_sb->s_fs_info; + const char *root_path = root->d_sb->s_fs_info; size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) @@ -552,7 +551,7 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, +int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -677,7 +676,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) +int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -701,7 +700,7 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; char *name; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index ea91fcb0ef9..30dd7b10b50 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,7 +8,7 @@ #include <linux/sched.h> #include "hpfs_fn.h" -static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -115,7 +115,7 @@ bail: return err; } -static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -201,7 +201,7 @@ bail: return err; } -static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 98580a3b500..3690467c944 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -181,7 +181,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) static void hpfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index f590b1160c6..d92f4ce8092 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -622,7 +622,6 @@ void hppfs_evict_inode(struct inode *ino) static void hppfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kfree(HPPFS_I(inode)); } @@ -726,7 +725,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) sb->s_fs_info = proc_mnt; err = -ENOMEM; - root_inode = get_inode(sb, dget(proc_mnt->mnt_sb->s_root)); + root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); if (!root_inode) goto out_mntput; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0be5a78598d..1e85a7ac021 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -447,8 +447,8 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, - gid_t gid, int mode, dev_t dev) +static struct inode *hugetlbfs_get_root(struct super_block *sb, + struct hugetlbfs_config *config) { struct inode *inode; @@ -456,9 +456,31 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, if (inode) { struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = uid; - inode->i_gid = gid; + inode->i_mode = S_IFDIR | config->mode; + inode->i_uid = config->uid; + inode->i_gid = config->gid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy, NULL); + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + } + return inode; +} + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct inode *dir, + umode_t mode, dev_t dev) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -500,20 +522,12 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, * File creation. Allocate an inode, and we're done.. */ static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, int mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; - gid_t gid; - - if (dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - gid = current_fsgid(); - } - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev); + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -523,7 +537,7 @@ static int hugetlbfs_mknod(struct inode *dir, return error; } -static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); if (!retval) @@ -531,7 +545,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -541,15 +555,8 @@ static int hugetlbfs_symlink(struct inode *dir, { struct inode *inode; int error = -ENOSPC; - gid_t gid; - - if (dir->i_mode & S_ISGID) - gid = dir->i_gid; - else - gid = current_fsgid(); - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), - gid, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -576,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; @@ -666,7 +674,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -858,8 +865,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - inode = hugetlbfs_get_inode(sb, config.uid, config.gid, - S_IFDIR | config.mode, 0); + inode = hugetlbfs_get_root(sb, &config); if (!inode) goto out_free; @@ -957,8 +963,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, path.mnt = mntget(hugetlbfs_vfsmount); error = -ENOSPC; - inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), - current_fsgid(), S_IFREG | S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; diff --git a/fs/inode.c b/fs/inode.c index ee4e66b998f..83ab215baab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,7 @@ #include <linux/ima.h> #include <linux/cred.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ +#include <linux/ratelimit.h> #include "internal.h" /* @@ -191,6 +192,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) } inode->i_private = NULL; inode->i_mapping = mapping; + INIT_LIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif @@ -241,6 +243,11 @@ void __destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); fsnotify_inode_delete(inode); + if (!inode->i_nlink) { + WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_acl); @@ -254,7 +261,6 @@ EXPORT_SYMBOL(__destroy_inode); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(inode_cachep, inode); } @@ -268,6 +274,82 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +/** + * drop_nlink - directly drop an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. In cases + * where we are attempting to track writes to the + * filesystem, a decrement to zero means an imminent + * write when the file is truncated and actually unlinked + * on the filesystem. + */ +void drop_nlink(struct inode *inode) +{ + WARN_ON(inode->i_nlink == 0); + inode->__i_nlink--; + if (!inode->i_nlink) + atomic_long_inc(&inode->i_sb->s_remove_count); +} +EXPORT_SYMBOL(drop_nlink); + +/** + * clear_nlink - directly zero an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. See + * drop_nlink() for why we care about i_nlink hitting zero. + */ +void clear_nlink(struct inode *inode) +{ + if (inode->i_nlink) { + inode->__i_nlink = 0; + atomic_long_inc(&inode->i_sb->s_remove_count); + } +} +EXPORT_SYMBOL(clear_nlink); + +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +void set_nlink(struct inode *inode, unsigned int nlink) +{ + if (!nlink) { + clear_nlink(inode); + } else { + /* Yes, some filesystems do change nlink from zero to one */ + if (inode->i_nlink == 0) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink = nlink; + } +} +EXPORT_SYMBOL(set_nlink); + +/** + * inc_nlink - directly increment an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. Currently, + * it is only here for parity with dec_nlink(). + */ +void inc_nlink(struct inode *inode) +{ + if (WARN_ON(inode->i_nlink == 0)) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink++; +} +EXPORT_SYMBOL(inc_nlink); + void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); @@ -290,7 +372,6 @@ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); @@ -692,6 +773,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); } @@ -855,8 +938,7 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ - if (!lockdep_match_class(&inode->i_mutex, - &type->i_mutex_key)) { + if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ @@ -883,6 +965,7 @@ void unlock_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; + smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } @@ -1508,7 +1591,7 @@ void file_update_time(struct file *file) if (sync_it & S_MTIME) inode->i_mtime = now; mark_inode_dirty_sync(inode); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); } EXPORT_SYMBOL(file_update_time); @@ -1568,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1586,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1616,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } @@ -1647,7 +1730,7 @@ EXPORT_SYMBOL(init_special_inode); * @mode: mode of the new inode */ void inode_init_owner(struct inode *inode, const struct inode *dir, - mode_t mode) + umode_t mode) { inode->i_uid = current_fsuid(); if (dir && dir->i_mode & S_ISGID) { diff --git a/fs/internal.h b/fs/internal.h index fe327c20af8..9962c59ba28 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -15,19 +15,14 @@ struct super_block; struct file_system_type; struct linux_binprm; struct path; +struct mount; /* * block_dev.c */ #ifdef CONFIG_BLOCK -extern struct super_block *blockdev_superblock; extern void __init bdev_cache_init(void); -static inline int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - extern int __sync_blockdev(struct block_device *bdev, int wait); #else @@ -35,11 +30,6 @@ static inline void bdev_cache_init(void) { } -static inline int sb_is_blkdev_sb(struct super_block *sb) -{ - return 0; -} - static inline int __sync_blockdev(struct block_device *bdev, int wait) { return 0; @@ -52,28 +42,17 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) extern void __init chrdev_init(void); /* - * exec.c - */ -extern int check_unsafe_exec(struct linux_binprm *); - -/* * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); extern int copy_mount_string(const void __user *, char **); -extern unsigned int mnt_get_count(struct vfsmount *mnt); -extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); extern struct vfsmount *lookup_mnt(struct path *); -extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, - struct vfsmount *); -extern void release_mounts(struct list_head *); -extern void umount_tree(struct vfsmount *, int, struct list_head *); -extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern int finish_automount(struct vfsmount *, struct path *); extern void mnt_make_longterm(struct vfsmount *); extern void mnt_make_shortterm(struct vfsmount *); +extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); @@ -98,10 +77,9 @@ extern struct file *get_empty_filp(void); */ extern int do_remount_sb(struct super_block *, int, void *, int); extern bool grab_super_passive(struct super_block *sb); -extern void __put_super(struct super_block *sb); -extern void put_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); +extern struct super_block *user_get_super(dev_t); /* * open.c @@ -111,7 +89,7 @@ extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); struct open_flags { int open_flag; - int mode; + umode_t mode; int acc_mode; int intent; }; diff --git a/fs/ioctl.c b/fs/ioctl.c index 1d9b9fcb2db..066836e8184 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) - error = -EINVAL; + error = -ENOTTY; out: return error; } diff --git a/fs/ioprio.c b/fs/ioprio.c index f79dab83e17..0f1b9515213 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio) if (err) return err; - task_lock(task); - do { - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); - if (ioc) - break; - - ioc = alloc_io_context(GFP_ATOMIC, -1); - if (!ioc) { - err = -ENOMEM; - break; - } - task->io_context = ioc; - } while (1); - - if (!err) { - ioc->ioprio = ioprio; - ioc->ioprio_changed = 1; + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc_ioprio_changed(ioc, ioprio); + put_io_context(ioc); } - task_unlock(task); return err; } EXPORT_SYMBOL_GPL(set_task_ioprio); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index f950059525f..bd62c76fb5d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -85,7 +85,6 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) static void isofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } @@ -170,8 +169,8 @@ struct iso9660_options{ unsigned char map; unsigned char check; unsigned int blocksize; - mode_t fmode; - mode_t dmode; + umode_t fmode; + umode_t dmode; gid_t gid; uid_t uid; char *iocharset; @@ -949,8 +948,11 @@ root_found: /* get the root dentry */ s->s_root = d_alloc_root(inode); - if (!(s->s_root)) - goto out_no_root; + if (!(s->s_root)) { + iput(inode); + error = -ENOMEM; + goto out_no_inode; + } kfree(opt.iocharset); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 7d33de84f52..0e73f63d927 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -50,14 +50,14 @@ struct isofs_sb_info { unsigned int s_uid_set:1; unsigned int s_gid_set:1; - mode_t s_fmode; - mode_t s_dmode; + umode_t s_fmode; + umode_t s_dmode; gid_t s_gid; uid_t s_uid; struct nls_table *s_nls_iocharset; /* Native language support table */ }; -#define ISOFS_INVALID_MODE ((mode_t) -1) +#define ISOFS_INVALID_MODE ((umode_t) -1) static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) { diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index f94fc48ff3a..05f0754f2b4 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -453,8 +453,6 @@ out: * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * - * Called with the journal lock held. - * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed @@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal) if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and + /* + * OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will - * start. */ - + * start. + */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; @@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal) spin_unlock(&journal->j_state_lock); return 1; } + spin_unlock(&journal->j_state_lock); + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. It's unlikely this will be + * necessary, especially with an appropriately sized journal, but we + * need this to guarantee correctness. Fortunately + * cleanup_journal_tail() doesn't get called all that often. + */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + + spin_lock(&journal->j_state_lock); + if (!tid_gt(first_tid, journal->j_tail_sequence)) { + spin_unlock(&journal->j_state_lock); + /* Someone else cleaned up journal so return 0 */ + return 0; + } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -537,7 +554,7 @@ int cleanup_journal_tail(journal_t *journal) * them. * * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 8799207df05..f2b9a571f4c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -392,6 +392,12 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index fea8dd661d2..59c09f9541b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -166,7 +166,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&journal->j_state_lock); } else { /* @@ -721,7 +721,6 @@ static journal_t * journal_init_common (void) init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 5b43e96788e..008bf062fd2 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -20,6 +20,7 @@ #include <linux/fs.h> #include <linux/jbd.h> #include <linux/errno.h> +#include <linux/blkdev.h> #endif /* @@ -263,6 +264,9 @@ int journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); return err; } diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 305a9076315..25c713e7071 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. + */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 7e59c6e66f9..7fce94b04bc 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. - * - * The journal lock should not be held on entry. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. + * + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - mutex_lock(&journal->j_barrier); } /** @@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal) * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with journal_lock_updates(). - * - * Should be called without the journal lock held. */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - mutex_unlock(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 16a698bd906..d49d202903f 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -565,7 +565,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 68d704db787..5069b847515 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD2: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + jbd2_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ jbd2_journal_switch_revoke_table(journal); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0fa0123151d..c0a5f9f1b12 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -173,7 +173,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); write_lock(&journal->j_state_lock); } else { /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 69fd9358811..30b2867d6cc 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flag clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffers in the next + * transaction which is going to be started. + */ +void jbd2_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd2_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd2_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd2_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0e41a4c080..35ae096bed5 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_updates, &wait, + TASK_UNINTERRUPTIBLE); if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); + finish_wait(&journal->j_wait_updates, &wait); break; } - prepare_to_wait(&journal->j_wait_updates, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); write_unlock(&journal->j_state_lock); schedule(); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index be6169bd8ac..973ac5822bd 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -22,16 +22,16 @@ static int jffs2_readdir (struct file *, void *, filldir_t); -static int jffs2_create (struct inode *,struct dentry *,int, +static int jffs2_create (struct inode *,struct dentry *,umode_t, struct nameidata *); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, struct nameidata *); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); -static int jffs2_mkdir (struct inode *,struct dentry *,int); +static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); +static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); static int jffs2_rename (struct inode *, struct dentry *, struct inode *, struct dentry *); @@ -169,8 +169,8 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) /***********************************************************************/ -static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, - struct nameidata *nd) +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; @@ -450,7 +450,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } -static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) +static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -618,7 +618,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev) +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index e513f1913c1..eafb8d37a6f 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; - ret = c->mtd->erase(c->mtd, instr); + ret = mtd_erase(c->mtd, instr); if (!ret) return; @@ -335,13 +335,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl void *ebuf; uint32_t ofs; size_t retlen; - int ret = -EIO; - - if (c->mtd->point) { - unsigned long *wordebuf; + int ret; + unsigned long *wordebuf; - ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, - &retlen, &ebuf, NULL); + ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, + &ebuf, NULL); + if (ret != -EOPNOTSUPP) { if (ret) { D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); goto do_flash_read; @@ -349,7 +348,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (retlen < c->sector_size) { /* Don't muck about if it won't let us point to the whole erase sector */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); - c->mtd->unpoint(c->mtd, jeb->offset, retlen); + mtd_unpoint(c->mtd, jeb->offset, retlen); goto do_flash_read; } wordebuf = ebuf-sizeof(*wordebuf); @@ -358,7 +357,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (*++wordebuf != ~0) break; } while(--retlen); - c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + mtd_unpoint(c->mtd, jeb->offset, c->sector_size); if (retlen) { printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n", *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); @@ -381,7 +380,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl *bad_offset = ofs; - ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); + ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf); if (ret) { printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); ret = -EIO; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 4b8afe39a87..2e0123867cb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r if (insert_inode_locked(inode) < 0) { make_bad_inode(inode); - unlock_new_inode(inode); iput(inode); return ERR_PTR(-EINVAL); } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ee57bac1ba6..3093ac4fb24 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info #ifndef __ECOS /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), * adding and jffs2_flash_read_end() interface. */ - if (c->mtd->point) { - err = c->mtd->point(c->mtd, ofs, len, &retlen, - (void **)&buffer, NULL); - if (!err && retlen < len) { - JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); - c->mtd->unpoint(c->mtd, ofs, retlen); - } else if (err) + err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL); + if (!err && retlen < len) { + JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); + mtd_unpoint(c->mtd, ofs, retlen); + } else if (err) { + if (err != -EOPNOTSUPP) JFFS2_WARNING("MTD point failed: error code %d.\n", err); - else - pointed = 1; /* succefully pointed to device */ - } + } else + pointed = 1; /* succefully pointed to device */ #endif if (!pointed) { @@ -101,7 +99,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif if (crc != tn->data_crc) { @@ -137,7 +135,7 @@ free_out: kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, ofs, len); + mtd_unpoint(c->mtd, ofs, len); #endif return err; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 28107ca136e..f99464833bb 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -97,15 +97,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) size_t pointlen, try_size; if (c->mtd->point) { - ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, - (void **)&flashbuf, NULL); + ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); if (!ret && pointlen < c->mtd->size) { /* Don't muck about if it won't let us point to the whole flash */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); - c->mtd->unpoint(c->mtd, 0, pointlen); + mtd_unpoint(c->mtd, 0, pointlen); flashbuf = NULL; } - if (ret) + if (ret && ret != -EOPNOTSUPP) D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); } #endif @@ -273,7 +273,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) kfree(flashbuf); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, 0, c->mtd->size); + mtd_unpoint(c->mtd, 0, c->mtd->size); #endif kfree(s); return ret; @@ -455,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo if (jffs2_cleanmarker_oob(c)) { int ret; - if (c->mtd->block_isbad(c->mtd, jeb->offset)) + if (mtd_block_isbad(c->mtd, jeb->offset)) return BLK_STATE_BADBLOCK; ret = jffs2_check_nand_cleanmarker(c, jeb); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index e7e97445411..f2d96b5e64f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -45,7 +45,6 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } @@ -97,9 +96,9 @@ static const char *jffs2_compr_name(unsigned int compr) } } -static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) +static int jffs2_show_options(struct seq_file *s, struct dentry *root) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); + struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb); struct jffs2_mount_opts *opts = &c->mount_opts; if (opts->override_compr) @@ -336,9 +335,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); - if (c->mtd->sync) - c->mtd->sync(c->mtd); - + mtd_sync(c->mtd); D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index b09e51d2f81..30e8f47e8a2 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf, size_t retlen; char *eccstr; - ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); + ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); if (ret && ret != -EUCLEAN && ret != -EBADMSG) { printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); return ret; @@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) } /* Do the read... */ - ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); + ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen, + buf); /* ECC recovered ? */ if ((ret == -EUCLEAN || ret == -EBADMSG) && @@ -413,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); breakme = 0; - c->mtd->write(c->mtd, ofs, towrite, &retlen, - brokenbuf); + mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, - rewrite_buf); + ret = mtd_write(c->mtd, ofs, towrite, &retlen, + rewrite_buf); if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { /* Argh. We tried. Really we did. */ @@ -619,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (breakme++ == 20) { printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); breakme = 0; - c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, - brokenbuf); + mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, + brokenbuf); ret = -EIO; } else #endif - ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); + ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, + &retlen, c->wbuf); if (ret) { printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); @@ -861,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, v += wbuf_retlen; if (vlen >= c->wbuf_pagesize) { - ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), - &wbuf_retlen, v); + ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen), + &wbuf_retlen, v); if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) goto outfile; @@ -948,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re int ret; if (!jffs2_is_writebuffered(c)) - return c->mtd->read(c->mtd, ofs, len, retlen, buf); + return mtd_read(c->mtd, ofs, len, retlen, buf); /* Read flash */ down_read(&c->wbuf_sem); - ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + ret = mtd_read(c->mtd, ofs, len, retlen, buf); if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { if (ret == -EBADMSG) @@ -1031,7 +1032,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1074,7 +1075,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); + ret = mtd_read_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1100,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; ops.datbuf = NULL; - ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); + ret = mtd_write_oob(c->mtd, jeb->offset, &ops); if (ret || ops.oobretlen != ops.ooblen) { printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" " bytes, read %zd bytes, error %d\n", @@ -1129,11 +1130,8 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * if( ++jeb->bad_count < MAX_ERASE_FAILURES) return 0; - if (!c->mtd->block_markbad) - return 1; // What else can we do? - printk(KERN_WARNING "JFFS2: marking eraseblock at %08x\n as bad", bad_offset); - ret = c->mtd->block_markbad(c->mtd, bad_offset); + ret = mtd_block_markbad(c->mtd, bad_offset); if (ret) { D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c index b9276b11bac..a1bda9dab3f 100644 --- a/fs/jffs2/writev.c +++ b/fs/jffs2/writev.c @@ -13,30 +13,6 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -/* This ought to be in core MTD code. All registered MTD devices - without writev should have this put in place. Bug the MTD - maintainer */ -static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs, - unsigned long count, loff_t to, size_t *retlen) -{ - unsigned long i; - size_t totlen = 0, thislen; - int ret = 0; - - for (i=0; i<count; i++) { - if (!vecs[i].iov_len) - continue; - ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base); - totlen += thislen; - if (ret || thislen != vecs[i].iov_len) - break; - to += vecs[i].iov_len; - } - if (retlen) - *retlen = totlen; - return ret; -} - int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen) { @@ -50,18 +26,14 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, } } - if (c->mtd->writev) - return c->mtd->writev(c->mtd, vecs, count, to, retlen); - else { - return mtd_fake_writev(c->mtd, vecs, count, to, retlen); - } + return mtd_writev(c->mtd, vecs, count, to, retlen); } int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf) { int ret; - ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); + ret = mtd_write(c->mtd, ofs, len, retlen, buf); if (jffs2_sum_active()) { struct kvec vecs[1]; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 6f98a186677..f19d1e04a37 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -68,7 +68,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) unsigned int oldflags; int err; - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) return err; @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return err; } default: diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cc5f811ed38..2eb952c41a6 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2349,7 +2349,7 @@ int jfsIOWait(void *arg) if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index af9606057dd..bb8b661bcc5 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2800,7 +2800,7 @@ int jfs_lazycommit(void *arg) if (freezing(current)) { LAZY_UNLOCK(flags); - refrigerator(); + try_to_freeze(); } else { DECLARE_WAITQUEUE(wq, current); @@ -2994,7 +2994,7 @@ int jfs_sync(void *arg) if (freezing(current)) { TXN_UNLOCK(); - refrigerator(); + try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a112ad96e47..5f7c160ea64 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -72,7 +72,7 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, +static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int rc = 0; @@ -205,7 +205,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, * note: * EACCESS: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -1353,7 +1353,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; struct btstack btstack; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index a44eff076c1..682bca642f3 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -119,7 +119,6 @@ static void jfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct jfs_inode_info *ji = JFS_IP(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(jfs_inode_cachep, ji); } @@ -609,9 +608,9 @@ static int jfs_sync_fs(struct super_block *sb, int wait) return 0; } -static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int jfs_show_options(struct seq_file *seq, struct dentry *root) { - struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb); + struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); if (sbi->uid != -1) seq_printf(seq, ",uid=%d", sbi->uid); diff --git a/fs/libfs.c b/fs/libfs.c index f6d411eef1e..5b2dbb3ba4f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/exportfs.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> +#include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <asm/uaccess.h> diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 23d7451b293..65ba36b80a9 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock); * Local NSM state */ u32 __read_mostly nsm_local_state; -int __read_mostly nsm_use_hostnames; +bool __read_mostly nsm_use_hostnames; static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 1ca0679c80b..2240d384d78 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -403,7 +403,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == file->f_file->f_path.mnt->mnt_sb; + return sb == file->f_file->f_path.dentry->d_sb; } /** diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 339e17e9133..9c501449450 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -13,13 +13,14 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; size_t retlen; int ret; - ret = mtd->read(mtd, ofs, len, &retlen, buf); + ret = mtd_read(mtd, ofs, len, &retlen, buf); BUG_ON(ret == -EINVAL); if (ret) return ret; @@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) return 0; } -static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, + void *buf) { struct logfs_super *super = logfs_super(sb); struct mtd_info *mtd = super->s_mtd; @@ -47,7 +49,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) BUG_ON(len > PAGE_CACHE_SIZE); page_start = ofs & PAGE_CACHE_MASK; page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; - ret = mtd->write(mtd, ofs, len, &retlen, buf); + ret = mtd_write(mtd, ofs, len, &retlen, buf); if (ret || (retlen != len)) return -EIO; @@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) * asynchronous properties. So just to prevent the first implementor of such * a thing from breaking logfs in 2350, we do the usual pointless dance to * declare a completion variable and wait for completion before returning - * from mtd_erase(). What an exercise in futility! + * from logfs_mtd_erase(). What an exercise in futility! */ static void logfs_erase_callback(struct erase_info *ei) { complete((struct completion *)ei->priv); } -static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, + size_t len) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; @@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) return 0; } -static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, +static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, int ensure_write) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; @@ -102,30 +105,29 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, ei.len = len; ei.callback = logfs_erase_callback; ei.priv = (long)&complete; - ret = mtd->erase(mtd, &ei); + ret = mtd_erase(mtd, &ei); if (ret) return -EIO; wait_for_completion(&complete); if (ei.state != MTD_ERASE_DONE) return -EIO; - return mtd_erase_mapping(sb, ofs, len); + return logfs_mtd_erase_mapping(sb, ofs, len); } -static void mtd_sync(struct super_block *sb) +static void logfs_mtd_sync(struct super_block *sb) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; - if (mtd->sync) - mtd->sync(mtd); + mtd_sync(mtd); } -static int mtd_readpage(void *_sb, struct page *page) +static int logfs_mtd_readpage(void *_sb, struct page *page) { struct super_block *sb = _sb; int err; - err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, page_address(page)); if (err == -EUCLEAN || err == -EBADMSG) { /* -EBADMSG happens regularly on power failures */ @@ -143,18 +145,15 @@ static int mtd_readpage(void *_sb, struct page *page) return err; } -static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) - return NULL; - *ofs = 0; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { *ofs += mtd->erasesize; if (*ofs >= mtd->size) return NULL; @@ -163,18 +162,15 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = mtd_readpage; + filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) - return NULL; - *ofs = mtd->size - mtd->erasesize; - while (mtd->block_isbad(mtd, *ofs)) { + while (mtd_block_isbad(mtd, *ofs)) { *ofs -= mtd->erasesize; if (*ofs <= 0) return NULL; @@ -184,7 +180,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); } -static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, +static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, size_t nr_pages) { struct logfs_super *super = logfs_super(sb); @@ -196,8 +192,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, page = find_lock_page(mapping, index + i); BUG_ON(!page); - err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); + err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); unlock_page(page); page_cache_release(page); if (err) @@ -206,7 +202,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, return 0; } -static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) { struct logfs_super *super = logfs_super(sb); int head; @@ -227,15 +223,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) len += head; } len = PAGE_ALIGN(len); - __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); } -static void mtd_put_device(struct logfs_super *s) +static void logfs_mtd_put_device(struct logfs_super *s) { put_mtd_device(s->s_mtd); } -static int mtd_can_write_buf(struct super_block *sb, u64 ofs) +static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) { struct logfs_super *super = logfs_super(sb); void *buf; @@ -244,7 +240,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs) buf = kmalloc(super->s_writesize, GFP_KERNEL); if (!buf) return -ENOMEM; - err = mtd_read(sb, ofs, super->s_writesize, buf); + err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); if (err) goto out; if (memchr_inv(buf, 0xff, super->s_writesize)) @@ -255,14 +251,14 @@ out: } static const struct logfs_device_ops mtd_devops = { - .find_first_sb = mtd_find_first_sb, - .find_last_sb = mtd_find_last_sb, - .readpage = mtd_readpage, - .writeseg = mtd_writeseg, - .erase = mtd_erase, - .can_write_buf = mtd_can_write_buf, - .sync = mtd_sync, - .put_device = mtd_put_device, + .find_first_sb = logfs_mtd_find_first_sb, + .find_last_sb = logfs_mtd_find_last_sb, + .readpage = logfs_mtd_readpage, + .writeseg = logfs_mtd_writeseg, + .erase = logfs_mtd_erase, + .can_write_buf = logfs_mtd_can_write_buf, + .sync = logfs_mtd_sync, + .put_device = logfs_mtd_put_device, }; int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b7d7f67cee5..3de7a32cadb 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, static int write_inode(struct inode *inode) { - return __logfs_write_inode(inode, WF_LOCK); + return __logfs_write_inode(inode, NULL, WF_LOCK); } static s64 dir_seek_data(struct inode *inode, s64 pos) @@ -482,7 +482,7 @@ out: return ret; } -static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -501,7 +501,7 @@ static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return __logfs_create(dir, dentry, inode, NULL, 0); } -static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -517,7 +517,7 @@ static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, return __logfs_create(dir, dentry, inode, NULL, 0); } -static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index b548c87a86f..3886cded283 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); + logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); mutex_unlock(&inode->i_mutex); return 0; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index caa4419285d..d4efb061bdc 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb) int i, max_dist; struct gc_candidate *cand = NULL, *this; - max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); for (i = max_dist; i >= 0; i--) { this = first_in_list(&super->s_low_list[i]); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 7e441ad5f79..a422f42238b 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -144,7 +144,6 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) static void logfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); } @@ -287,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) return 0; - ret = __logfs_write_inode(inode, flags); + ret = __logfs_write_inode(inode, NULL, flags); LOGFS_BUG_ON(ret, inode->i_sb); return ret; } @@ -324,7 +323,7 @@ static void logfs_set_ino_generation(struct super_block *sb, mutex_unlock(&super->s_journal_mutex); } -struct inode *logfs_new_inode(struct inode *dir, int mode) +struct inode *logfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -364,7 +363,9 @@ static void logfs_init_once(void *_li) static int logfs_sync_fs(struct super_block *sb, int wait) { + logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); return 0; } diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 9da29706f91..1e1c369df22 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, if (len == 0) return logfs_write_header(super, header, 0, type); - BUG_ON(len > sb->s_blocksize); compr_len = logfs_compress(buf, data, len, sb->s_blocksize); if (compr_len < 0 || type == JE_ANCHOR) { memcpy(data, buf, len); diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 398ecff6e54..5f093760946 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -520,7 +520,7 @@ extern const struct super_operations logfs_super_operations; struct inode *logfs_iget(struct super_block *sb, ino_t ino); struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); void logfs_safe_iput(struct inode *inode, int cookie); -struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_inode(struct inode *dir, umode_t mode); struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); int logfs_init_inode_cache(void); @@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void); void logfs_set_blocks(struct inode *inode, u64 no); /* these logically belong into inode.c but actually reside in readwrite.c */ int logfs_read_inode(struct inode *inode); -int __logfs_write_inode(struct inode *inode, long flags); +int __logfs_write_inode(struct inode *inode, struct page *, long flags); void logfs_evict_inode(struct inode *inode); /* journal.c */ @@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block, __be64 *array, int page_is_empty); int logfs_exist_block(struct inode *inode, u64 bix); int get_page_reserve(struct inode *inode, struct page *page); +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); extern struct logfs_block_ops indirect_block_ops; /* segment.c */ @@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb); void logfs_sync_area(struct logfs_area *area); void logfs_sync_segments(struct super_block *sb); void freeseg(struct super_block *sb, u32 segno); +void free_areas(struct super_block *sb); /* area handling */ int logfs_init_areas(struct super_block *sb); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 2ac4217b790..4153e65b014 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock) * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked * in addition to PG_locked. */ -static void logfs_get_wblocks(struct super_block *sb, struct page *page, - int lock) +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock) { struct logfs_super *super = logfs_super(sb); @@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page, } } -static void logfs_put_wblocks(struct super_block *sb, struct page *page, - int lock) +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock) { struct logfs_super *super = logfs_super(sb); @@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block) if (inode->i_ino == LOGFS_INO_MASTER) logfs_write_anchor(inode->i_sb); else { - ret = __logfs_write_inode(inode, 0); + ret = __logfs_write_inode(inode, NULL, 0); /* see indirect_write_block comment */ BUG_ON(ret); } @@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block) static void indirect_free_block(struct super_block *sb, struct logfs_block *block) { - ClearPagePrivate(block->page); - block->page->private = 0; + struct page *page = block->page; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } __free_block(sb, block); } @@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page) logfs_unpack_index(page->index, &bix, &level); block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); block->page = page; + SetPagePrivate(page); - page->private = (unsigned long)block; + page_cache_get(page); + set_page_private(page, (unsigned long) block); + block->ops = &indirect_block_ops; } @@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags) static int __logfs_delete(struct inode *inode, struct page *page) { long flags = WF_DELETE; + int err; inode->i_ctime = inode->i_mtime = CURRENT_TIME; if (page->index < I0_BLOCKS) return logfs_write_direct(inode, page, flags); + err = grow_inode(inode, page->index, 0); + if (err) + return err; return logfs_write_rec(inode, page, page->index, 0, flags); } @@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, if (inode->i_ino == LOGFS_INO_MASTER) logfs_write_anchor(inode->i_sb); else { - err = __logfs_write_inode(inode, flags); + err = __logfs_write_inode(inode, page, flags); } } } @@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target) logfs_get_wblocks(sb, NULL, 1); err = __logfs_truncate(inode, size); if (!err) - err = __logfs_write_inode(inode, 0); + err = __logfs_write_inode(inode, NULL, 0); logfs_put_wblocks(sb, NULL, 1); } @@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page) li->li_block = block; block->page = NULL; - page->private = 0; - ClearPagePrivate(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } } static void move_inode_to_page(struct page *page, struct inode *inode) @@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode) BUG_ON(PagePrivate(page)); block->ops = &indirect_block_ops; block->page = page; - page->private = (unsigned long)block; - SetPagePrivate(page); + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } block->inode = NULL; li->li_block = NULL; @@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) ec_level); } -int __logfs_write_inode(struct inode *inode, long flags) +int __logfs_write_inode(struct inode *inode, struct page *page, long flags) { struct super_block *sb = inode->i_sb; int ret; - logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + logfs_get_wblocks(sb, page, flags & WF_LOCK); ret = do_write_inode(inode); - logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + logfs_put_wblocks(sb, page, flags & WF_LOCK); return ret; } diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 9d518735325..ab798ed1cc8 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, BUG_ON(!page); /* FIXME: reserve a pool */ SetPageUptodate(page); memcpy(page_address(page) + offset, buf, copylen); - SetPagePrivate(page); + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); buf += copylen; @@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area) page = get_mapping_page(sb, index, 0); BUG_ON(!page); /* FIXME: reserve a pool */ memset(page_address(page) + offset, 0xff, len); - SetPagePrivate(page); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); } } @@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area) BUG_ON(!page); /* FIXME: reserve a pool */ SetPageUptodate(page); memset(page_address(page), 0xff, PAGE_CACHE_SIZE); - SetPagePrivate(page); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); index++; no_indizes--; @@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page, mempool_free(item, super->s_alias_pool); } block->page = page; - SetPagePrivate(page); - page->private = (unsigned long)block; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } block->ops = &indirect_block_ops; initialize_block_counters(page, block, data, 0); } @@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page) list_add(&item->list, &block->item_list); } block->page = NULL; - ClearPagePrivate(page); - page->private = 0; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } block->ops = &btree_block_ops; err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, block); @@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno) page = find_get_page(mapping, ofs >> PAGE_SHIFT); if (!page) continue; - ClearPagePrivate(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + } page_cache_release(page); } } @@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area) kfree(area); } +void free_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); +} + static struct logfs_area *alloc_area(struct super_block *sb) { struct logfs_area *area; @@ -923,10 +954,6 @@ err: void logfs_cleanup_areas(struct super_block *sb) { struct logfs_super *super = logfs_super(sb); - int i; btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); - for_each_area(i) - free_area(super->s_area[i]); - free_area(super->s_journal_area); } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index e795c234ea3..c9ee7f5d1ca 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -486,14 +486,15 @@ static void logfs_kill_sb(struct super_block *sb) /* Alias entries slow down mount, so evict as many as possible */ sync_filesystem(sb); logfs_write_anchor(sb); + free_areas(sb); /* * From this point on alias entries are simply dropped - and any * writes to the object store are considered bugs. */ - super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; log_super("LogFS: Now in shutdown\n"); generic_shutdown_super(sb); + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index ef175cb8cfd..4bc50dac8e9 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -209,7 +209,7 @@ void minix_free_inode(struct inode * inode) mark_buffer_dirty(bh); } -struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) +struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 4d46a6a5907..fa8b612b8ce 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -71,7 +71,6 @@ static struct inode *minix_alloc_inode(struct super_block *sb) static void minix_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 26bbd55e82e..c889ef0aa57 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -46,7 +46,7 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, int, int *); +extern struct inode * minix_new_inode(const struct inode *, umode_t, int *); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 6e6777f1b4b..2f76e38c206 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -36,7 +36,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st return NULL; } -static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int error; struct inode *inode; @@ -54,7 +54,7 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ return error; } -static int minix_create(struct inode * dir, struct dentry *dentry, int mode, +static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return minix_mknod(dir, dentry, mode, 0); @@ -103,7 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) +static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/mount.h b/fs/mount.h new file mode 100644 index 00000000000..4ef36d93e5a --- /dev/null +++ b/fs/mount.h @@ -0,0 +1,76 @@ +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/poll.h> + +struct mnt_namespace { + atomic_t count; + struct mount * root; + struct list_head list; + wait_queue_head_t poll; + int event; +}; + +struct mnt_pcp { + int mnt_count; + int mnt_writers; +}; + +struct mount { + struct list_head mnt_hash; + struct mount *mnt_parent; + struct dentry *mnt_mountpoint; + struct vfsmount mnt; +#ifdef CONFIG_SMP + struct mnt_pcp __percpu *mnt_pcp; + atomic_t mnt_longterm; /* how many of the refs are longterm */ +#else + int mnt_count; + int mnt_writers; +#endif + struct list_head mnt_mounts; /* list of children, anchored here */ + struct list_head mnt_child; /* and going through their mnt_child */ + struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ + struct list_head mnt_list; + struct list_head mnt_expire; /* link in fs-specific expiry list */ + struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_slave_list;/* list of slave mounts */ + struct list_head mnt_slave; /* slave list entry */ + struct mount *mnt_master; /* slave is on master->mnt_slave_list */ + struct mnt_namespace *mnt_ns; /* containing namespace */ +#ifdef CONFIG_FSNOTIFY + struct hlist_head mnt_fsnotify_marks; + __u32 mnt_fsnotify_mask; +#endif + int mnt_id; /* mount identifier */ + int mnt_group_id; /* peer group identifier */ + int mnt_expiry_mark; /* true if marked for expiry */ + int mnt_pinned; + int mnt_ghosts; +}; + +static inline struct mount *real_mount(struct vfsmount *mnt) +{ + return container_of(mnt, struct mount, mnt); +} + +static inline int mnt_has_parent(struct mount *mnt) +{ + return mnt != mnt->mnt_parent; +} + +extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int); + +static inline void get_mnt_ns(struct mnt_namespace *ns) +{ + atomic_inc(&ns->count); +} + +struct proc_mounts { + struct seq_file m; /* must be the first element */ + struct mnt_namespace *ns; + struct path root; + int (*show)(struct seq_file *, struct vfsmount *); +}; + +extern const struct seq_operations mounts_op; diff --git a/fs/mpage.c b/fs/mpage.c index fdfae9fa98c..643e9f55ef2 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - struct blk_plug plug; - - blk_start_plug(&plug); map_bh.b_state = 0; map_bh.b_size = 0; @@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, BUG_ON(!list_empty(pages)); if (bio) mpage_bio_submit(READ, bio); - blk_finish_plug(&plug); return 0; } EXPORT_SYMBOL(mpage_readpages); diff --git a/fs/namei.c b/fs/namei.c index 5008f01787f..46ea9cc1664 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -36,6 +36,7 @@ #include <asm/uaccess.h> #include "internal.h" +#include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) @@ -139,21 +140,19 @@ static int do_getname(const char __user *filename, char *page) static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *tmp, *result; + char *result = __getname(); + int retval; - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - int retval = do_getname(filename, tmp); + if (!result) + return ERR_PTR(-ENOMEM); - result = tmp; - if (retval < 0) { - if (retval == -ENOENT && empty) - *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(tmp); - result = ERR_PTR(retval); - } + retval = do_getname(filename, result); + if (retval < 0) { + if (retval == -ENOENT && empty) + *empty = 1; + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(retval); } } audit_getname(result); @@ -676,36 +675,38 @@ follow_link(struct path *link, struct nameidata *nd, void **p) static int follow_up_rcu(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; - parent = path->mnt->mnt_parent; - if (parent == path->mnt) + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) return 0; - mountpoint = path->mnt->mnt_mountpoint; + mountpoint = mnt->mnt_mountpoint; path->dentry = mountpoint; - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } int follow_up(struct path *path) { - struct vfsmount *parent; + struct mount *mnt = real_mount(path->mnt); + struct mount *parent; struct dentry *mountpoint; br_read_lock(vfsmount_lock); - parent = path->mnt->mnt_parent; - if (parent == path->mnt) { + parent = mnt->mnt_parent; + if (&parent->mnt == path->mnt) { br_read_unlock(vfsmount_lock); return 0; } - mntget(parent); - mountpoint = dget(path->mnt->mnt_mountpoint); + mntget(&parent->mnt); + mountpoint = dget(mnt->mnt_mountpoint); br_read_unlock(vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); - path->mnt = parent; + path->mnt = &parent->mnt; return 1; } @@ -884,7 +885,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode) { for (;;) { - struct vfsmount *mounted; + struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. @@ -898,8 +899,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; - path->mnt = mounted; - path->dentry = mounted->mnt_root; + path->mnt = &mounted->mnt; + path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* @@ -915,12 +916,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static void follow_mount_rcu(struct nameidata *nd) { while (d_mountpoint(nd->path.dentry)) { - struct vfsmount *mounted; + struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); if (!mounted) break; - nd->path.mnt = mounted; - nd->path.dentry = mounted->mnt_root; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } } @@ -1094,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) + if (unlikely(IS_DEADDIR(inode))) { + dput(dentry); return ERR_PTR(-ENOENT); + } old = inode->i_op->lookup(inode, dentry, nd); if (unlikely(old)) { @@ -1371,6 +1374,34 @@ static inline int can_lookup(struct inode *inode) return 1; } +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(*name++, hash); + return end_name_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +/* + * We know there's a real path component here of at least + * one character. + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c && c != '/'); + *hashp = end_name_hash(hash); + return len; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1391,31 +1422,22 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - unsigned long hash; struct qstr this; - unsigned int c; + long len; int type; err = may_lookup(nd); if (err) break; + len = hash_name(name, &this.hash); this.name = name; - c = *(const unsigned char *)name; - - hash = init_name_hash(); - do { - name++; - hash = partial_name_hash(c, hash); - c = *(const unsigned char *)name; - } while (c && (c != '/')); - this.len = name - (const char *) this.name; - this.hash = end_name_hash(hash); + this.len = len; type = LAST_NORM; - if (this.name[0] == '.') switch (this.len) { + if (name[0] == '.') switch (len) { case 2: - if (this.name[1] == '.') { + if (name[1] == '.') { type = LAST_DOTDOT; nd->flags |= LOOKUP_JUMPED; } @@ -1434,12 +1456,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } - /* remove trailing slashes? */ - if (!c) + if (!name[len]) goto last_component; - while (*++name == '/'); - if (!*name) + /* + * If it wasn't NUL, we know it was '/'. Skip that + * slash, and continue until no more slashes. + */ + do { + len++; + } while (unlikely(name[len] == '/')); + if (!name[len]) goto last_component; + name += len; err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) @@ -1772,24 +1800,21 @@ static struct dentry *lookup_hash(struct nameidata *nd) struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; - unsigned long hash; unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); this.name = name; this.len = len; + this.hash = full_name_hash(name, len); if (!len) return ERR_PTR(-EACCES); - hash = init_name_hash(); while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return ERR_PTR(-EACCES); - hash = partial_name_hash(c, hash); } - this.hash = end_name_hash(hash); /* * See if the low-level filesystem might want * to use its own hash.. @@ -1976,7 +2001,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } } -int vfs_create(struct inode *dir, struct dentry *dentry, int mode, +int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int error = may_create(dir, dentry); @@ -2137,7 +2162,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* sayonara */ error = complete_walk(nd); if (error) - return ERR_PTR(-ECHILD); + return ERR_PTR(error); error = -ENOTDIR; if (nd->flags & LOOKUP_DIRECTORY) { @@ -2177,7 +2202,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Negative dentry, just create the file */ if (!dentry->d_inode) { - int mode = op->mode; + umode_t mode = op->mode; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); /* @@ -2236,7 +2261,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) - goto exit; + return ERR_PTR(error); error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; @@ -2444,7 +2469,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct pat } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int error = may_create(dir, dentry); @@ -2472,7 +2497,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } -static int may_mknod(mode_t mode) +static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: @@ -2489,7 +2514,7 @@ static int may_mknod(mode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned, dev) { struct dentry *dentry; @@ -2536,12 +2561,12 @@ out_dput: return error; } -SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev) +SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return sys_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error = may_create(dir, dentry); @@ -2562,7 +2587,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return error; } -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { struct dentry *dentry; struct path path; @@ -2590,7 +2615,7 @@ out_dput: return error; } -SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return sys_mkdirat(AT_FDCWD, pathname, mode); } diff --git a/fs/namespace.c b/fs/namespace.c index cfc6d4448aa..e6081996c9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,30 +9,17 @@ */ #include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/spinlock.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/acct.h> +#include <linux/export.h> #include <linux/capability.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/sysfs.h> -#include <linux/seq_file.h> #include <linux/mnt_namespace.h> #include <linux/namei.h> -#include <linux/nsproxy.h> #include <linux/security.h> -#include <linux/mount.h> -#include <linux/ramfs.h> -#include <linux/log2.h> #include <linux/idr.h> -#include <linux/fs_struct.h> -#include <linux/fsnotify.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/acct.h> /* acct_auto_close_mnt */ +#include <linux/ramfs.h> /* init_rootfs */ +#include <linux/fs_struct.h> /* get_fs_root et.al. */ +#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ +#include <linux/uaccess.h> #include "pnode.h" #include "internal.h" @@ -78,7 +65,7 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) * allocation is serialized by namespace_sem, but we need the spinlock to * serialize with freeing. */ -static int mnt_alloc_id(struct vfsmount *mnt) +static int mnt_alloc_id(struct mount *mnt) { int res; @@ -95,7 +82,7 @@ retry: return res; } -static void mnt_free_id(struct vfsmount *mnt) +static void mnt_free_id(struct mount *mnt) { int id = mnt->mnt_id; spin_lock(&mnt_id_lock); @@ -110,7 +97,7 @@ static void mnt_free_id(struct vfsmount *mnt) * * mnt_group_ida is protected by namespace_sem */ -static int mnt_alloc_group_id(struct vfsmount *mnt) +static int mnt_alloc_group_id(struct mount *mnt) { int res; @@ -129,7 +116,7 @@ static int mnt_alloc_group_id(struct vfsmount *mnt) /* * Release a peer group ID */ -void mnt_release_group_id(struct vfsmount *mnt) +void mnt_release_group_id(struct mount *mnt) { int id = mnt->mnt_group_id; ida_remove(&mnt_group_ida, id); @@ -141,7 +128,7 @@ void mnt_release_group_id(struct vfsmount *mnt) /* * vfsmount lock must be held for read */ -static inline void mnt_add_count(struct vfsmount *mnt, int n) +static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); @@ -152,35 +139,10 @@ static inline void mnt_add_count(struct vfsmount *mnt, int n) #endif } -static inline void mnt_set_count(struct vfsmount *mnt, int n) -{ -#ifdef CONFIG_SMP - this_cpu_write(mnt->mnt_pcp->mnt_count, n); -#else - mnt->mnt_count = n; -#endif -} - -/* - * vfsmount lock must be held for read - */ -static inline void mnt_inc_count(struct vfsmount *mnt) -{ - mnt_add_count(mnt, 1); -} - -/* - * vfsmount lock must be held for read - */ -static inline void mnt_dec_count(struct vfsmount *mnt) -{ - mnt_add_count(mnt, -1); -} - /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct vfsmount *mnt) +unsigned int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; @@ -196,9 +158,9 @@ unsigned int mnt_get_count(struct vfsmount *mnt) #endif } -static struct vfsmount *alloc_vfsmnt(const char *name) +static struct mount *alloc_vfsmnt(const char *name) { - struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; @@ -277,7 +239,7 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -static inline void mnt_inc_writers(struct vfsmount *mnt) +static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); @@ -286,7 +248,7 @@ static inline void mnt_inc_writers(struct vfsmount *mnt) #endif } -static inline void mnt_dec_writers(struct vfsmount *mnt) +static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); @@ -295,7 +257,7 @@ static inline void mnt_dec_writers(struct vfsmount *mnt) #endif } -static unsigned int mnt_get_writers(struct vfsmount *mnt) +static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; @@ -311,6 +273,15 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) #endif } +static int mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). @@ -321,7 +292,7 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) */ /** * mnt_want_write - get write access to a mount - * @mnt: the mount on which to take a write + * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is * about to be performed to it, and makes sure that @@ -329,8 +300,9 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt) * the write operation is finished, mnt_drop_write() * must be called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *mnt) +int mnt_want_write(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); @@ -341,7 +313,7 @@ int mnt_want_write(struct vfsmount *mnt) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (mnt->mnt_flags & MNT_WRITE_HOLD) + while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -349,12 +321,10 @@ int mnt_want_write(struct vfsmount *mnt) * MNT_WRITE_HOLD is cleared. */ smp_rmb(); - if (__mnt_is_readonly(mnt)) { + if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; - goto out; } -out: preempt_enable(); return ret; } @@ -378,7 +348,7 @@ int mnt_clone_write(struct vfsmount *mnt) if (__mnt_is_readonly(mnt)) return -EROFS; preempt_disable(); - mnt_inc_writers(mnt); + mnt_inc_writers(real_mount(mnt)); preempt_enable(); return 0; } @@ -412,17 +382,23 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file); void mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); - mnt_dec_writers(mnt); + mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); -static int mnt_make_readonly(struct vfsmount *mnt) +void mnt_drop_write_file(struct file *file) +{ + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); + +static int mnt_make_readonly(struct mount *mnt) { int ret = 0; br_write_lock(vfsmount_lock); - mnt->mnt_flags |= MNT_WRITE_HOLD; + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. @@ -448,25 +424,61 @@ static int mnt_make_readonly(struct vfsmount *mnt) if (mnt_get_writers(mnt) > 0) ret = -EBUSY; else - mnt->mnt_flags |= MNT_READONLY; + mnt->mnt.mnt_flags |= MNT_READONLY; /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt_flags &= ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; br_write_unlock(vfsmount_lock); return ret; } -static void __mnt_unmake_readonly(struct vfsmount *mnt) +static void __mnt_unmake_readonly(struct mount *mnt) { br_write_lock(vfsmount_lock); - mnt->mnt_flags &= ~MNT_READONLY; + mnt->mnt.mnt_flags &= ~MNT_READONLY; + br_write_unlock(vfsmount_lock); +} + +int sb_prepare_remount_readonly(struct super_block *sb) +{ + struct mount *mnt; + int err = 0; + + /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + br_write_lock(vfsmount_lock); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } br_write_unlock(vfsmount_lock); + + return err; } -static void free_vfsmnt(struct vfsmount *mnt) +static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); @@ -481,20 +493,20 @@ static void free_vfsmnt(struct vfsmount *mnt) * @dir. If @dir is set return the first mount else return the last mount. * vfsmount_lock must be held for read or write. */ -struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, int dir) { struct list_head *head = mount_hashtable + hash(mnt, dentry); struct list_head *tmp = head; - struct vfsmount *p, *found = NULL; + struct mount *p, *found = NULL; for (;;) { tmp = dir ? tmp->next : tmp->prev; p = NULL; if (tmp == head) break; - p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { + p = list_entry(tmp, struct mount, mnt_hash); + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) { found = p; break; } @@ -508,16 +520,21 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, */ struct vfsmount *lookup_mnt(struct path *path) { - struct vfsmount *child_mnt; + struct mount *child_mnt; br_read_lock(vfsmount_lock); - if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) - mntget(child_mnt); - br_read_unlock(vfsmount_lock); - return child_mnt; + child_mnt = __lookup_mnt(path->mnt, path->dentry, 1); + if (child_mnt) { + mnt_add_count(child_mnt, 1); + br_read_unlock(vfsmount_lock); + return &child_mnt->mnt; + } else { + br_read_unlock(vfsmount_lock); + return NULL; + } } -static inline int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } @@ -548,12 +565,12 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) * Clear dentry's mounted state if it has no remaining mounts. * vfsmount_lock must be held for write. */ -static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) +static void dentry_reset_mounted(struct dentry *dentry) { unsigned u; for (u = 0; u < HASH_SIZE; u++) { - struct vfsmount *p; + struct mount *p; list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { if (p->mnt_mountpoint == dentry) @@ -568,25 +585,26 @@ static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry) /* * vfsmount lock must be held for write */ -static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +static void detach_mnt(struct mount *mnt, struct path *old_path) { old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = mnt->mnt_parent; + old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - dentry_reset_mounted(old_path->mnt, old_path->dentry); + dentry_reset_mounted(old_path->dentry); } /* * vfsmount lock must be held for write */ -void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, - struct vfsmount *child_mnt) +void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry, + struct mount *child_mnt) { - child_mnt->mnt_parent = mntget(mnt); + mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = dget(dentry); + child_mnt->mnt_parent = mnt; spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -595,15 +613,15 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, /* * vfsmount lock must be held for write */ -static void attach_mnt(struct vfsmount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, struct path *path) { - mnt_set_mountpoint(path->mnt, path->dentry, mnt); + mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); + list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); } -static inline void __mnt_make_longterm(struct vfsmount *mnt) +static inline void __mnt_make_longterm(struct mount *mnt) { #ifdef CONFIG_SMP atomic_inc(&mnt->mnt_longterm); @@ -611,7 +629,7 @@ static inline void __mnt_make_longterm(struct vfsmount *mnt) } /* needs vfsmount lock for write */ -static inline void __mnt_make_shortterm(struct vfsmount *mnt) +static inline void __mnt_make_shortterm(struct mount *mnt) { #ifdef CONFIG_SMP atomic_dec(&mnt->mnt_longterm); @@ -621,10 +639,10 @@ static inline void __mnt_make_shortterm(struct vfsmount *mnt) /* * vfsmount lock must be held for write */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct mount *mnt) { - struct vfsmount *parent = mnt->mnt_parent; - struct vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; @@ -639,12 +657,12 @@ static void commit_tree(struct vfsmount *mnt) list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(parent, mnt->mnt_mountpoint)); + hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { @@ -657,14 +675,14 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) p = p->mnt_parent; } } - return list_entry(next, struct vfsmount, mnt_child); + return list_entry(next, struct mount, mnt_child); } -static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { - p = list_entry(prev, struct vfsmount, mnt_child); + p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; @@ -673,7 +691,7 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p) struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct vfsmount *mnt; + struct mount *mnt; struct dentry *root; if (!type) @@ -684,7 +702,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return ERR_PTR(-ENOMEM); if (flags & MS_KERNMOUNT) - mnt->mnt_flags = MNT_INTERNAL; + mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { @@ -692,19 +710,22 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void return ERR_CAST(root); } - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt.mnt_root = root; + mnt->mnt.mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - return mnt; + br_write_lock(vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + br_write_unlock(vfsmount_lock); + return &mnt->mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); -static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, +static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt_sb; - struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt = alloc_vfsmnt(old->mnt_devname); if (mnt) { if (flag & (CL_SLAVE | CL_PRIVATE)) @@ -718,12 +739,15 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, goto out_free; } - mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; atomic_inc(&sb->s_active); - mnt->mnt_sb = sb; - mnt->mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + br_write_lock(vfsmount_lock); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + br_write_unlock(vfsmount_lock); if (flag & CL_SLAVE) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); @@ -753,9 +777,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, return NULL; } -static inline void mntfree(struct vfsmount *mnt) +static inline void mntfree(struct mount *mnt) { - struct super_block *sb = mnt->mnt_sb; + struct vfsmount *m = &mnt->mnt; + struct super_block *sb = m->mnt_sb; /* * This probably indicates that somebody messed @@ -768,32 +793,32 @@ static inline void mntfree(struct vfsmount *mnt) * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); - fsnotify_vfsmount_delete(mnt); - dput(mnt->mnt_root); + fsnotify_vfsmount_delete(m); + dput(m->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); } -static void mntput_no_expire(struct vfsmount *mnt) +static void mntput_no_expire(struct mount *mnt) { put_again: #ifdef CONFIG_SMP br_read_lock(vfsmount_lock); if (likely(atomic_read(&mnt->mnt_longterm))) { - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); br_read_unlock(vfsmount_lock); return; } br_read_unlock(vfsmount_lock); br_write_lock(vfsmount_lock); - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); return; } #else - mnt_dec_count(mnt); + mnt_add_count(mnt, -1); if (likely(mnt_get_count(mnt))) return; br_write_lock(vfsmount_lock); @@ -802,9 +827,10 @@ put_again: mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); + acct_auto_close_mnt(&mnt->mnt); goto put_again; } + list_del(&mnt->mnt_instance); br_write_unlock(vfsmount_lock); mntfree(mnt); } @@ -812,10 +838,11 @@ put_again: void mntput(struct vfsmount *mnt) { if (mnt) { + struct mount *m = real_mount(mnt); /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ - if (unlikely(mnt->mnt_expiry_mark)) - mnt->mnt_expiry_mark = 0; - mntput_no_expire(mnt); + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); @@ -823,7 +850,7 @@ EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) - mnt_inc_count(mnt); + mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); @@ -831,16 +858,17 @@ EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); - mnt->mnt_pinned++; + real_mount(mnt)->mnt_pinned++; br_write_unlock(vfsmount_lock); } EXPORT_SYMBOL(mnt_pin); -void mnt_unpin(struct vfsmount *mnt) +void mnt_unpin(struct vfsmount *m) { + struct mount *mnt = real_mount(m); br_write_lock(vfsmount_lock); if (mnt->mnt_pinned) { - mnt_inc_count(mnt); + mnt_add_count(mnt, 1); mnt->mnt_pinned--; } br_write_unlock(vfsmount_lock); @@ -858,12 +886,12 @@ static inline void mangle(struct seq_file *m, const char *s) * * See also save_mount_options(). */ -int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +int generic_show_options(struct seq_file *m, struct dentry *root) { const char *options; rcu_read_lock(); - options = rcu_dereference(mnt->mnt_sb->s_options); + options = rcu_dereference(root->d_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); @@ -907,10 +935,10 @@ void replace_mount_options(struct super_block *sb, char *options) EXPORT_SYMBOL(replace_mount_options); #ifdef CONFIG_PROC_FS -/* iterator */ +/* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); down_read(&namespace_sem); return seq_list_start(&p->ns->list, *pos); @@ -918,7 +946,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); return seq_list_next(v, &p->ns->list, pos); } @@ -928,219 +956,18 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -int mnt_had_events(struct proc_mounts *p) +static int m_show(struct seq_file *m, void *v) { - struct mnt_namespace *ns = p->ns; - int res = 0; - - br_read_lock(vfsmount_lock); - if (p->m.poll_event != ns->event) { - p->m.poll_event = ns->event; - res = 1; - } - br_read_unlock(vfsmount_lock); - - return res; -} - -struct proc_fs_info { - int flag; - const char *str; -}; - -static int show_sb_opts(struct seq_file *m, struct super_block *sb) -{ - static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - - return security_sb_show_options(m, sb); -} - -static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) -{ - static const struct proc_fs_info mnt_info[] = { - { MNT_NOSUID, ",nosuid" }, - { MNT_NODEV, ",nodev" }, - { MNT_NOEXEC, ",noexec" }, - { MNT_NOATIME, ",noatime" }, - { MNT_NODIRATIME, ",nodiratime" }, - { MNT_RELATIME, ",relatime" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } -} - -static void show_type(struct seq_file *m, struct super_block *sb) -{ - mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, sb->s_subtype); - } -} - -static int show_vfsmnt(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - - if (mnt->mnt_sb->s_op->show_devname) { - err = mnt->mnt_sb->s_op->show_devname(m, mnt); - if (err) - goto out; - } else { - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - } - seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - show_type(m, mnt->mnt_sb); - seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - err = show_sb_opts(m, mnt->mnt_sb); - if (err) - goto out; - show_mnt_opts(m, mnt); - if (mnt->mnt_sb->s_op->show_options) - err = mnt->mnt_sb->s_op->show_options(m, mnt); - seq_puts(m, " 0 0\n"); -out: - return err; + struct proc_mounts *p = container_of(m, struct proc_mounts, m); + struct mount *r = list_entry(v, struct mount, mnt_list); + return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_vfsmnt -}; - -static int show_mountinfo(struct seq_file *m, void *v) -{ - struct proc_mounts *p = m->private; - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct super_block *sb = mnt->mnt_sb; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - struct path root = p->root; - int err = 0; - - seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, - MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) - err = sb->s_op->show_path(m, mnt); - else - seq_dentry(m, mnt->mnt_root, " \t\n\\"); - if (err) - goto out; - seq_putc(m, ' '); - - /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ - err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (err) - goto out; - - seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); - show_mnt_opts(m, mnt); - - /* Tagged fields ("foo:X" or "bar") */ - if (IS_MNT_SHARED(mnt)) - seq_printf(m, " shared:%i", mnt->mnt_group_id); - if (IS_MNT_SLAVE(mnt)) { - int master = mnt->mnt_master->mnt_group_id; - int dom = get_dominating_id(mnt, &p->root); - seq_printf(m, " master:%i", master); - if (dom && dom != master) - seq_printf(m, " propagate_from:%i", dom); - } - if (IS_MNT_UNBINDABLE(mnt)) - seq_puts(m, " unbindable"); - - /* Filesystem specific data */ - seq_puts(m, " - "); - show_type(m, sb); - seq_putc(m, ' '); - if (sb->s_op->show_devname) - err = sb->s_op->show_devname(m, mnt); - else - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - if (err) - goto out; - seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - err = show_sb_opts(m, sb); - if (err) - goto out; - if (sb->s_op->show_options) - err = sb->s_op->show_options(m, mnt); - seq_putc(m, '\n'); -out: - return err; -} - -const struct seq_operations mountinfo_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_mountinfo, -}; - -static int show_vfsstat(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; - - /* device */ - if (mnt->mnt_sb->s_op->show_devname) { - seq_puts(m, "device "); - err = mnt->mnt_sb->s_op->show_devname(m, mnt); - } else { - if (mnt->mnt_devname) { - seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); - } else - seq_puts(m, "no device"); - } - - /* mount point */ - seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - - /* file system type */ - seq_puts(m, "with fstype "); - show_type(m, mnt->mnt_sb); - - /* optional statistics */ - if (mnt->mnt_sb->s_op->show_stats) { - seq_putc(m, ' '); - if (!err) - err = mnt->mnt_sb->s_op->show_stats(m, mnt); - } - - seq_putc(m, '\n'); - return err; -} - -const struct seq_operations mountstats_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_vfsstat, + .show = m_show, }; #endif /* CONFIG_PROC_FS */ @@ -1152,11 +979,13 @@ const struct seq_operations mountstats_op = { * open files, pwds, chroots or sub mounts that are * busy. */ -int may_umount_tree(struct vfsmount *mnt) +int may_umount_tree(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; - struct vfsmount *p; + struct mount *p; + BUG_ON(!m); /* write lock needed for mnt_get_count */ br_write_lock(vfsmount_lock); @@ -1192,7 +1021,7 @@ int may_umount(struct vfsmount *mnt) int ret = 1; down_read(&namespace_sem); br_write_lock(vfsmount_lock); - if (propagate_mount_busy(mnt, 2)) + if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; br_write_unlock(vfsmount_lock); up_read(&namespace_sem); @@ -1203,25 +1032,25 @@ EXPORT_SYMBOL(may_umount); void release_mounts(struct list_head *head) { - struct vfsmount *mnt; + struct mount *mnt; while (!list_empty(head)) { - mnt = list_first_entry(head, struct vfsmount, mnt_hash); + mnt = list_first_entry(head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); - if (mnt->mnt_parent != mnt) { + if (mnt_has_parent(mnt)) { struct dentry *dentry; - struct vfsmount *m; + struct mount *m; br_write_lock(vfsmount_lock); dentry = mnt->mnt_mountpoint; m = mnt->mnt_parent; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; m->mnt_ghosts--; br_write_unlock(vfsmount_lock); dput(dentry); - mntput(m); + mntput(&m->mnt); } - mntput(mnt); + mntput(&mnt->mnt); } } @@ -1229,10 +1058,10 @@ void release_mounts(struct list_head *head) * vfsmount lock must be held for write * namespace_sem must be held for write */ -void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) { LIST_HEAD(tmp_list); - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) list_move(&p->mnt_hash, &tmp_list); @@ -1247,24 +1076,24 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) p->mnt_ns = NULL; __mnt_make_shortterm(p); list_del_init(&p->mnt_child); - if (p->mnt_parent != p) { + if (mnt_has_parent(p)) { p->mnt_parent->mnt_ghosts++; - dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint); + dentry_reset_mounted(p->mnt_mountpoint); } change_mnt_propagation(p, MS_PRIVATE); } list_splice(&tmp_list, kill); } -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt, struct list_head *umounts); -static int do_umount(struct vfsmount *mnt, int flags) +static int do_umount(struct mount *mnt, int flags) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = mnt->mnt.mnt_sb; int retval; LIST_HEAD(umount_list); - retval = security_sb_umount(mnt, flags); + retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; @@ -1275,7 +1104,7 @@ static int do_umount(struct vfsmount *mnt, int flags) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { - if (mnt == current->fs->root.mnt || + if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; @@ -1317,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags) * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ - if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. @@ -1359,6 +1188,7 @@ static int do_umount(struct vfsmount *mnt, int flags) SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { struct path path; + struct mount *mnt; int retval; int lookup_flags = 0; @@ -1371,21 +1201,22 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; + mnt = real_mount(path.mnt); retval = -EINVAL; if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(path.mnt)) + if (!check_mnt(mnt)) goto dput_and_out; retval = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto dput_and_out; - retval = do_umount(path.mnt, flags); + retval = do_umount(mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path.dentry); - mntput_no_expire(path.mnt); + mntput_no_expire(mnt); out: return retval; } @@ -1420,10 +1251,10 @@ static int mount_is_safe(struct path *path) #endif } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct vfsmount *res, *p, *q, *r, *s; + struct mount *res, *p, *q, *r; struct path path; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) @@ -1436,6 +1267,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + struct mount *s; if (!is_subdir(r->mnt_mountpoint, dentry)) continue; @@ -1449,9 +1281,9 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = q; + path.mnt = &q->mnt; path.dentry = p->mnt_mountpoint; - q = clone_mnt(p, p->mnt_root, flag); + q = clone_mnt(p, p->mnt.mnt_root, flag); if (!q) goto Enomem; br_write_lock(vfsmount_lock); @@ -1474,11 +1306,12 @@ Enomem: struct vfsmount *collect_mounts(struct path *path) { - struct vfsmount *tree; + struct mount *tree; down_write(&namespace_sem); - tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); - return tree; + return tree ? &tree->mnt : NULL; } void drop_collected_mounts(struct vfsmount *mnt) @@ -1486,7 +1319,7 @@ void drop_collected_mounts(struct vfsmount *mnt) LIST_HEAD(umount_list); down_write(&namespace_sem); br_write_lock(vfsmount_lock); - umount_tree(mnt, 0, &umount_list); + umount_tree(real_mount(mnt), 0, &umount_list); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); @@ -1495,21 +1328,21 @@ void drop_collected_mounts(struct vfsmount *mnt) int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { - struct vfsmount *mnt; + struct mount *mnt; int res = f(root, arg); if (res) return res; - list_for_each_entry(mnt, &root->mnt_list, mnt_list) { - res = f(mnt, arg); + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); if (res) return res; } return 0; } -static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +static void cleanup_group_ids(struct mount *mnt, struct mount *end) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) @@ -1517,9 +1350,9 @@ static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) } } -static int invent_group_ids(struct vfsmount *mnt, bool recurse) +static int invent_group_ids(struct mount *mnt, bool recurse) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { @@ -1597,13 +1430,13 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse) * Must be called without spinlocks held, since this function can sleep * in allocations. */ -static int attach_recursive_mnt(struct vfsmount *source_mnt, +static int attach_recursive_mnt(struct mount *source_mnt, struct path *path, struct path *parent_path) { LIST_HEAD(tree_list); - struct vfsmount *dest_mnt = path->mnt; + struct mount *dest_mnt = real_mount(path->mnt); struct dentry *dest_dentry = path->dentry; - struct vfsmount *child, *p; + struct mount *child, *p; int err; if (IS_MNT_SHARED(dest_mnt)) { @@ -1624,7 +1457,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(parent_path->mnt->mnt_ns); + touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1672,13 +1505,13 @@ static void unlock_mount(struct path *path) mutex_unlock(&path->dentry->d_inode->i_mutex); } -static int graft_tree(struct vfsmount *mnt, struct path *path) +static int graft_tree(struct mount *mnt, struct path *path) { - if (mnt->mnt_sb->s_flags & MS_NOUSER) + if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; if (S_ISDIR(path->dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; if (d_unlinked(path->dentry)) @@ -1709,7 +1542,8 @@ static int flags_to_propagation_type(int flags) */ static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = path->mnt; + struct mount *m; + struct mount *mnt = real_mount(path->mnt); int recurse = flag & MS_REC; int type; int err = 0; @@ -1749,7 +1583,7 @@ static int do_loopback(struct path *path, char *old_name, { LIST_HEAD(umount_list); struct path old_path; - struct vfsmount *mnt = NULL; + struct mount *mnt = NULL, *old; int err = mount_is_safe(path); if (err) return err; @@ -1763,18 +1597,20 @@ static int do_loopback(struct path *path, char *old_name, if (err) goto out; + old = real_mount(old_path.mnt); + err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_path.mnt)) + if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) goto out2; err = -ENOMEM; if (recurse) - mnt = copy_tree(old_path.mnt, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, 0); else - mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); + mnt = clone_mnt(old, old_path.dentry, 0); if (!mnt) goto out2; @@ -1804,9 +1640,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) return 0; if (readonly_request) - error = mnt_make_readonly(mnt); + error = mnt_make_readonly(real_mount(mnt)); else - __mnt_unmake_readonly(mnt); + __mnt_unmake_readonly(real_mount(mnt)); return error; } @@ -1820,11 +1656,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, { int err; struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!check_mnt(path->mnt)) + if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != path->mnt->mnt_root) @@ -1841,22 +1678,22 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { br_write_lock(vfsmount_lock); - mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; - path->mnt->mnt_flags = mnt_flags; + mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt->mnt.mnt_flags = mnt_flags; br_write_unlock(vfsmount_lock); } up_write(&sb->s_umount); if (!err) { br_write_lock(vfsmount_lock); - touch_mnt_namespace(path->mnt->mnt_ns); + touch_mnt_namespace(mnt->mnt_ns); br_write_unlock(vfsmount_lock); } return err; } -static inline int tree_contains_unbindable(struct vfsmount *mnt) +static inline int tree_contains_unbindable(struct mount *mnt) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; @@ -1867,7 +1704,8 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) static int do_move_mount(struct path *path, char *old_name) { struct path old_path, parent_path; - struct vfsmount *p; + struct mount *p; + struct mount *old; int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1881,8 +1719,11 @@ static int do_move_mount(struct path *path, char *old_name) if (err < 0) goto out; + old = real_mount(old_path.mnt); + p = real_mount(path->mnt); + err = -EINVAL; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + if (!check_mnt(p) || !check_mnt(old)) goto out1; if (d_unlinked(path->dentry)) @@ -1892,7 +1733,7 @@ static int do_move_mount(struct path *path, char *old_name) if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_path.mnt == old_path.mnt->mnt_parent) + if (!mnt_has_parent(old)) goto out1; if (S_ISDIR(path->dentry->d_inode->i_mode) != @@ -1901,28 +1742,26 @@ static int do_move_mount(struct path *path, char *old_name) /* * Don't move a mount residing in a shared parent. */ - if (old_path.mnt->mnt_parent && - IS_MNT_SHARED(old_path.mnt->mnt_parent)) + if (IS_MNT_SHARED(old->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ - if (IS_MNT_SHARED(path->mnt) && - tree_contains_unbindable(old_path.mnt)) + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out1; err = -ELOOP; - for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_path.mnt) + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) goto out1; - err = attach_recursive_mnt(old_path.mnt, path, &parent_path); + err = attach_recursive_mnt(old, path, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_path.mnt->mnt_expire); + list_del_init(&old->mnt_expire); out1: unlock_mount(path); out: @@ -1955,7 +1794,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } -struct vfsmount * +static struct vfsmount * do_kern_mount(const char *fstype, int flags, const char *name, void *data) { struct file_system_type *type = get_fs_type(fstype); @@ -1969,12 +1808,11 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data) put_filesystem(type); return mnt; } -EXPORT_SYMBOL_GPL(do_kern_mount); /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1985,20 +1823,20 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag return err; err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt))) goto unlock; /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (path->mnt->mnt_sb == newmnt->mnt_sb && + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) goto unlock; - newmnt->mnt_flags = mnt_flags; + newmnt->mnt.mnt_flags = mnt_flags; err = graft_tree(newmnt, path); unlock: @@ -2027,7 +1865,7 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - err = do_add_mount(mnt, path, mnt_flags); + err = do_add_mount(real_mount(mnt), path, mnt_flags); if (err) mntput(mnt); return err; @@ -2035,11 +1873,12 @@ static int do_new_mount(struct path *path, char *type, int flags, int finish_automount(struct vfsmount *m, struct path *path) { + struct mount *mnt = real_mount(m); int err; /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ - BUG_ON(mnt_get_count(m) < 2); + BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == path->dentry) { @@ -2047,15 +1886,15 @@ int finish_automount(struct vfsmount *m, struct path *path) goto fail; } - err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); if (!err) return 0; fail: /* remove m from any expiration list it may be on */ - if (!list_empty(&m->mnt_expire)) { + if (!list_empty(&mnt->mnt_expire)) { down_write(&namespace_sem); br_write_lock(vfsmount_lock); - list_del_init(&m->mnt_expire); + list_del_init(&mnt->mnt_expire); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); } @@ -2074,7 +1913,7 @@ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) down_write(&namespace_sem); br_write_lock(vfsmount_lock); - list_add_tail(&mnt->mnt_expire, expiry_list); + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); br_write_unlock(vfsmount_lock); up_write(&namespace_sem); @@ -2088,7 +1927,7 @@ EXPORT_SYMBOL(mnt_set_expiry); */ void mark_mounts_for_expiry(struct list_head *mounts) { - struct vfsmount *mnt, *next; + struct mount *mnt, *next; LIST_HEAD(graveyard); LIST_HEAD(umounts); @@ -2111,7 +1950,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { - mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1, &umounts); } @@ -2129,9 +1968,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ -static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +static int select_submounts(struct mount *parent, struct list_head *graveyard) { - struct vfsmount *this_parent = parent; + struct mount *this_parent = parent; struct list_head *next; int found = 0; @@ -2140,10 +1979,10 @@ repeat: resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; - struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; - if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. @@ -2175,15 +2014,15 @@ resume: * * vfsmount_lock must be held for write */ -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt, struct list_head *umounts) { LIST_HEAD(graveyard); - struct vfsmount *m; + struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { - m = list_first_entry(&graveyard, struct vfsmount, + m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, 1, umounts); @@ -2370,12 +2209,13 @@ static struct mnt_namespace *alloc_mnt_ns(void) void mnt_make_longterm(struct vfsmount *mnt) { - __mnt_make_longterm(mnt); + __mnt_make_longterm(real_mount(mnt)); } -void mnt_make_shortterm(struct vfsmount *mnt) +void mnt_make_shortterm(struct vfsmount *m) { #ifdef CONFIG_SMP + struct mount *mnt = real_mount(m); if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) return; br_write_lock(vfsmount_lock); @@ -2393,7 +2233,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; - struct vfsmount *p, *q; + struct mount *p, *q; + struct mount *old = mnt_ns->root; + struct mount *new; new_ns = alloc_mnt_ns(); if (IS_ERR(new_ns)) @@ -2401,15 +2243,15 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, down_write(&namespace_sem); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, - CL_COPY_ALL | CL_EXPIRE); - if (!new_ns->root) { + new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); + if (!new) { up_write(&namespace_sem); kfree(new_ns); return ERR_PTR(-ENOMEM); } + new_ns->root = new; br_write_lock(vfsmount_lock); - list_add_tail(&new_ns->list, &new_ns->root->mnt_list); + list_add_tail(&new_ns->list, &new->mnt_list); br_write_unlock(vfsmount_lock); /* @@ -2417,27 +2259,27 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ - p = mnt_ns->root; - q = new_ns->root; + p = old; + q = new; while (p) { q->mnt_ns = new_ns; __mnt_make_longterm(q); if (fs) { - if (p == fs->root.mnt) { - fs->root.mnt = mntget(q); + if (&p->mnt == fs->root.mnt) { + fs->root.mnt = mntget(&q->mnt); __mnt_make_longterm(q); - mnt_make_shortterm(p); - rootmnt = p; + mnt_make_shortterm(&p->mnt); + rootmnt = &p->mnt; } - if (p == fs->pwd.mnt) { - fs->pwd.mnt = mntget(q); + if (&p->mnt == fs->pwd.mnt) { + fs->pwd.mnt = mntget(&q->mnt); __mnt_make_longterm(q); - mnt_make_shortterm(p); - pwdmnt = p; + mnt_make_shortterm(&p->mnt); + pwdmnt = &p->mnt; } } - p = next_mnt(p, mnt_ns->root); - q = next_mnt(q, new_ns->root); + p = next_mnt(p, old); + q = next_mnt(q, new); } up_write(&namespace_sem); @@ -2470,22 +2312,20 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint */ -struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns; - - new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { + struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; __mnt_make_longterm(mnt); new_ns->root = mnt; - list_add(&new_ns->list, &new_ns->root->mnt_list); + list_add(&new_ns->list, &mnt->mnt_list); } else { - mntput(mnt); + mntput(m); } return new_ns; } -EXPORT_SYMBOL(create_mnt_ns); struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) { @@ -2559,6 +2399,31 @@ out_type: } /* + * Return true if path is reachable from root + * + * namespace_sem or vfsmount_lock is held + */ +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +int path_is_under(struct path *path1, struct path *path2) +{ + int res; + br_read_lock(vfsmount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + br_read_unlock(vfsmount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +/* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets @@ -2586,8 +2451,8 @@ out_type: SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct vfsmount *tmp; struct path new, old, parent_path, root_parent, root; + struct mount *new_mnt, *root_mnt; int error; if (!capable(CAP_SYS_ADMIN)) @@ -2611,11 +2476,13 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out3; error = -EINVAL; - if (IS_MNT_SHARED(old.mnt) || - IS_MNT_SHARED(new.mnt->mnt_parent) || - IS_MNT_SHARED(root.mnt->mnt_parent)) + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + if (IS_MNT_SHARED(real_mount(old.mnt)) || + IS_MNT_SHARED(new_mnt->mnt_parent) || + IS_MNT_SHARED(root_mnt->mnt_parent)) goto out4; - if (!check_mnt(root.mnt) || !check_mnt(new.mnt)) + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) @@ -2629,33 +2496,22 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ - if (root.mnt->mnt_parent == root.mnt) + if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ - if (new.mnt->mnt_parent == new.mnt) + if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old.mnt; - if (tmp != new.mnt) { - for (;;) { - if (tmp->mnt_parent == tmp) - goto out4; /* already mounted on put_old */ - if (tmp->mnt_parent == new.mnt) - break; - tmp = tmp->mnt_parent; - } - if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) - goto out4; - } else if (!is_subdir(old.dentry, new.dentry)) + if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) goto out4; br_write_lock(vfsmount_lock); - detach_mnt(new.mnt, &parent_path); - detach_mnt(root.mnt, &root_parent); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root.mnt, &old); + attach_mnt(root_mnt, &old); /* mount new_root on / */ - attach_mnt(new.mnt, &root_parent); + attach_mnt(new_mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(vfsmount_lock); chroot_fs_refs(&root, &new); @@ -2693,8 +2549,8 @@ static void __init init_mount_tree(void) init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); - root.mnt = ns->root; - root.dentry = ns->root->mnt_root; + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -2707,7 +2563,7 @@ void __init mnt_init(void) init_rwsem(&namespace_sem); - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); @@ -2747,7 +2603,6 @@ void put_mnt_ns(struct mnt_namespace *ns) release_mounts(&umount_list); kfree(ns); } -EXPORT_SYMBOL(put_mnt_ns); struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { @@ -2776,5 +2631,5 @@ EXPORT_SYMBOL(kern_unmount); bool our_mnt(struct vfsmount *mnt) { - return check_mnt(mnt); + return check_mnt(real_mount(mnt)); } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 9c51f621e90..aeed93a6bde 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -30,15 +30,15 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); -static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *); +static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *); static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); static int ncp_unlink(struct inode *, struct dentry *); -static int ncp_mkdir(struct inode *, struct dentry *, int); +static int ncp_mkdir(struct inode *, struct dentry *, umode_t); static int ncp_rmdir(struct inode *, struct dentry *); static int ncp_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int ncp_mknod(struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev); + umode_t mode, dev_t rdev); #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) extern int ncp_symlink(struct inode *, struct dentry *, const char *); #else @@ -919,7 +919,7 @@ out_close: goto out; } -int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, +int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev, __le32 attributes) { struct ncp_server *server = NCP_SERVER(dir); @@ -928,7 +928,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, int mode, int opmode; __u8 __name[NCP_MAXPATHLEN + 1]; - PPRINTK("ncp_create_new: creating %s/%s, mode=%x\n", + PPRINTK("ncp_create_new: creating %s/%s, mode=%hx\n", dentry->d_parent->d_name.name, dentry->d_name.name, mode); ncp_age_dentry(server, dentry); @@ -979,13 +979,13 @@ out: return error; } -static int ncp_create(struct inode *dir, struct dentry *dentry, int mode, +static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return ncp_create_new(dir, dentry, mode, 0, 0); } -static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct ncp_entry_info finfo; struct ncp_server *server = NCP_SERVER(dir); @@ -1201,12 +1201,12 @@ out: } static int ncp_mknod(struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { if (!new_valid_dev(rdev)) return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%o\n", mode); + DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); } return -EPERM; /* Strange, but true */ diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index cbd1a61c110..3d1e34f8a68 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -44,7 +44,7 @@ static void ncp_evict_inode(struct inode *); static void ncp_put_super(struct super_block *); static int ncp_statfs(struct dentry *, struct kstatfs *); -static int ncp_show_options(struct seq_file *, struct vfsmount *); +static int ncp_show_options(struct seq_file *, struct dentry *); static struct kmem_cache * ncp_inode_cachep; @@ -60,7 +60,6 @@ static struct inode *ncp_alloc_inode(struct super_block *sb) static void ncp_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } @@ -323,9 +322,9 @@ static void ncp_stop_tasks(struct ncp_server *server) { flush_work_sync(&server->timeout_tq); } -static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int ncp_show_options(struct seq_file *seq, struct dentry *root) { - struct ncp_server *server = NCP_SBP(mnt->mnt_sb); + struct ncp_server *server = NCP_SBP(root->d_sb); unsigned int tmp; if (server->m.uid != 0) diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 790e92a9ec6..6958adfaff0 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -901,7 +901,7 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ret = __ncp_ioctl(inode, cmd, arg); outDropWrite: if (need_drop_write) - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); out: return ret; } diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 09881e6aa5a..32c06587351 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -114,7 +114,7 @@ int ncp_dirhandle_alloc(struct ncp_server *, __u8 vol, __le32 dirent, __u8 *dirh int ncp_dirhandle_free(struct ncp_server *, __u8 dirhandle); int ncp_create_new(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev, __le32 attributes); + umode_t mode, dev_t rdev, __le32 attributes); static inline int ncp_is_nfs_extras(struct ncp_server* server, unsigned int volnum) { #ifdef CONFIG_NCPFS_NFS_NS diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 661f861d80c..52439ddc8de 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -108,7 +108,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { char *rawlink; int length, err, i, outlen; int kludge; - int mode; + umode_t mode; __le32 attr; unsigned int hdr; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 281ae95932c..48cfac31f64 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -90,9 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - struct rpc_call_ops call_ops; - void (*pnfs_callback) (void *data); + void (*pnfs_callback) (void *data, int num_se); void *data; + int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -103,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); + rv->bse_count = 0; } return rv; } @@ -117,7 +118,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data); + p->pnfs_callback(p->data, p->bse_count); kfree(p); } @@ -146,14 +147,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, { struct bio *bio; + npg = min(npg, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, npg); - if (!bio) - return NULL; + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } - bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = end_io; - bio->bi_private = par; + if (bio) { + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } return bio; } @@ -212,22 +218,15 @@ static void bl_read_cleanup(struct work_struct *work) } static void -bl_end_par_io_read(void *data) +bl_end_par_io_read(void *data, int unused) { struct nfs_read_data *rdata = data; + rdata->task.tk_status = rdata->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); schedule_work(&rdata->task.u.tk_work); } -/* We don't want normal .rpc_call_done callback used, so we replace it - * with this stub. - */ -static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) -{ - return; -} - static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -247,8 +246,6 @@ bl_read_pagelist(struct nfs_read_data *rdata) par = alloc_parallel(rdata); if (!par) goto use_mds; - par->call_ops = *rdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; par->pnfs_callback = bl_end_par_io_read; /* At this point, we can no longer jump to use_mds */ @@ -322,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl, { sector_t isect, end; struct pnfs_block_extent *be; + struct pnfs_block_short_extent *se; dprintk("%s(%llu, %u)\n", __func__, offset, count); if (count == 0) @@ -334,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl, be = bl_find_get_extent(bl, isect, NULL); BUG_ON(!be); /* FIXME */ len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - bl_mark_for_commit(be, isect, len); /* What if fails? */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + se = bl_pop_one_short_extent(be->be_inval); + BUG_ON(!se); + bl_mark_for_commit(be, isect, len, se); + } isect += len; bl_put_extent(be); } @@ -357,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err) end_page_writeback(page); page_cache_release(page); } while (bvec >= bio->bi_io_vec); - if (!uptodate) { + + if (unlikely(!uptodate)) { if (!wdata->pnfs_error) wdata->pnfs_error = -EIO; pnfs_set_lo_fail(wdata->lseg); @@ -366,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err) put_parallel(par); } -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; @@ -392,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); - if (!wdata->pnfs_error) { + if (likely(!wdata->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->lseg), wdata->args.offset, wdata->args.count); @@ -401,11 +402,16 @@ static void bl_write_cleanup(struct work_struct *work) } /* Called when last of bios associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data) +static void bl_end_par_io_write(void *data, int num_se) { struct nfs_write_data *wdata = data; - wdata->task.tk_status = 0; + if (unlikely(wdata->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, + num_se); + } + + wdata->task.tk_status = wdata->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); schedule_work(&wdata->task.u.tk_work); @@ -484,6 +490,55 @@ cleanup: return ret; } +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. + */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, + struct pnfs_block_extent *cow_read) +{ + struct page *page; + int locked = 0; + page = find_get_page(inode->i_mapping, index); + if (page) + goto check_page; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s oom\n", __func__); + return ERR_PTR(-ENOMEM); + } + locked = 1; + +check_page: + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + if (locked) + unlock_page(page); + page_cache_release(page); + return NULL; + } + + if (!locked) { + lock_page(page); + locked = 1; + goto check_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + return page; +} + static enum pnfs_try_status bl_write_pagelist(struct nfs_write_data *wdata, int sync) { @@ -508,9 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) */ par = alloc_parallel(wdata); if (!par) - return PNFS_NOT_ATTEMPTED; - par->call_ops = *wdata->mds_ops; - par->call_ops.rpc_call_done = bl_rpc_do_nothing; + goto out_mds; par->pnfs_callback = bl_end_par_io_write; /* At this point, have to be more careful with error handling */ @@ -518,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); if (!be || !is_writable(be, isect)) { dprintk("%s no matching extents!\n", __func__); - wdata->pnfs_error = -EINVAL; - goto out; + goto out_mds; } /* First page inside INVALID extent */ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else + goto out_mds; temp = offset >> PAGE_CACHE_SHIFT; npg_zero = do_div(temp, npg_per_block); isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & @@ -543,36 +599,16 @@ fill_invalid_ext: dprintk("%s zero %dth page: index %lu isect %llu\n", __func__, npg_zero, index, (unsigned long long)isect); - page = - find_or_create_page(wdata->inode->i_mapping, index, - GFP_NOFS); - if (!page) { - dprintk("%s oom\n", __func__); - wdata->pnfs_error = -ENOMEM; + page = bl_find_get_zeroing_page(wdata->inode, index, + cow_read); + if (unlikely(IS_ERR(page))) { + wdata->pnfs_error = PTR_ERR(page); goto out; - } - - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - * sector_initialized: already written out - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - unlock_page(page); - page_cache_release(page); + } else if (page == NULL) goto next_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); @@ -581,6 +617,19 @@ fill_invalid_ext: wdata->pnfs_error = ret; goto out; } + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else { + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = -ENOMEM; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, isect, page, be, bl_end_io_write_zero, par); @@ -589,10 +638,6 @@ fill_invalid_ext: bio = NULL; goto out; } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); next_page: isect += PAGE_CACHE_SECTORS; extent_length -= PAGE_CACHE_SECTORS; @@ -616,13 +661,21 @@ next_page: wdata->pnfs_error = -EINVAL; goto out; } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent( + be->be_inval))) + par->bse_count++; + else { + wdata->pnfs_error = -ENOMEM; + goto out; + } + } extent_length = be->be_length - (isect - be->be_f_offset); } if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS, - NULL); + PAGE_CACHE_SECTORS); if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); @@ -664,6 +717,10 @@ out: bl_submit_bio(WRITE, bio); put_parallel(par); return PNFS_ATTEMPTED; +out_mds: + bl_put_extent(be); + kfree(par); + return PNFS_NOT_ATTEMPTED; } /* FIXME - range ignored */ @@ -690,11 +747,17 @@ static void release_inval_marks(struct pnfs_inval_markings *marks) { struct pnfs_inval_tracking *pos, *temp; + struct pnfs_block_short_extent *se, *stemp; list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { list_del(&pos->it_link); kfree(pos); } + + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + } return; } @@ -779,16 +842,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) static void free_blk_mountid(struct block_mount_id *mid) { if (mid) { - struct pnfs_block_dev *dev; - spin_lock(&mid->bm_lock); - while (!list_empty(&mid->bm_devlist)) { - dev = list_first_entry(&mid->bm_devlist, - struct pnfs_block_dev, - bm_node); + struct pnfs_block_dev *dev, *tmp; + + /* No need to take bm_lock as we are last user freeing bm_devlist */ + list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { list_del(&dev->bm_node); bl_free_block_dev(dev); } - spin_unlock(&mid->bm_lock); kfree(mid); } } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 42acf7ef599..e31a2df28e7 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -70,6 +70,7 @@ struct pnfs_inval_markings { spinlock_t im_lock; struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ sector_t im_block_size; /* Server blocksize in sectors */ + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ }; struct pnfs_inval_tracking { @@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) { spin_lock_init(&marks->im_lock); INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + INIT_LIST_HEAD(&marks->im_extents); marks->im_block_size = blocksize; marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, blocksize); @@ -186,8 +188,7 @@ struct pnfs_block_extent * bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent **cow_read); int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages); + sector_t offset, sector_t length); void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); @@ -200,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length); + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new); +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 19fa7b0b8c0..1abac09f7cd 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, return 0; } else { struct pnfs_inval_tracking *new; - if (storage) - new = storage; - else { - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - } + new = storage; new->it_sector = s; new->it_tags = (1 << tag); list_add(&new->it_link, &pos->it_link); @@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) } /* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +static int _preload_range(struct pnfs_inval_markings *marks, + u64 offset, u64 length) { u64 start, end, s; int count, i, used = 0, status = -ENOMEM; struct pnfs_inval_tracking **storage; + struct my_tree *tree = &marks->im_tree; dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); start = normalize(offset, tree->mtt_step_size); @@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length) goto out_cleanup; } - /* Now need lock - HOW??? */ - + spin_lock_bh(&marks->im_lock); for (s = start; s < end; s += tree->mtt_step_size) used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + spin_unlock_bh(&marks->im_lock); - /* Unlock - HOW??? */ status = 0; out_cleanup: @@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length) return status; } -static void set_needs_init(sector_t *array, sector_t offset) -{ - sector_t *p = array; - - dprintk("%s enter\n", __func__); - if (!p) - return; - while (*p < offset) - p++; - if (*p == offset) - return; - else if (*p == ~0) { - *p++ = offset; - *p = ~0; - return; - } else { - sector_t *save = p; - dprintk("%s Adding %llu\n", __func__, (u64)offset); - while (*p != ~0) - p++; - p++; - memmove(save + 1, save, (char *)p - (char *)save); - *save = offset; - return; - } -} - /* We are relying on page lock to serialize this */ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } @@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks, { int rv; - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return rv; } /* Marks sectors in [offest, offset_length) as having been initialized. * All lengths are step-aligned, where step is min(pagesize, blocksize). - * Notes where partial block is initialized, and helps prepare it for - * complete initialization later. + * Currently assumes offset is page-aligned */ -/* Currently assumes offset is page-aligned */ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length, - sector_t **pages) + sector_t offset, sector_t length) { - sector_t s, start, end; - sector_t *array = NULL; /* Pages to mark */ + sector_t start, end; dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - s = max((sector_t) 3, - 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); - dprintk("%s set max=%llu\n", __func__, (u64)s); - if (pages) { - array = kmalloc(s * sizeof(sector_t), GFP_NOFS); - if (!array) - goto outerr; - array[0] = ~0; - } start = normalize(offset, marks->im_block_size); end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(&marks->im_tree, start, end - start)) + if (_preload_range(marks, start, end - start)) goto outerr; - spin_lock(&marks->im_lock); - - for (s = normalize_up(start, PAGE_CACHE_SECTORS); - s < offset; s += PAGE_CACHE_SECTORS) { - dprintk("%s pre-area pages\n", __func__); - /* Portion of used block is not initialized */ - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } + spin_lock_bh(&marks->im_lock); if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) goto out_unlock; - for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); - s < end; s += PAGE_CACHE_SECTORS) { - dprintk("%s post-area pages\n", __func__); - if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) - set_needs_init(array, s); - } - - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); - if (pages) { - if (array[0] == ~0) { - kfree(array); - *pages = NULL; - } else - *pages = array; - } return 0; - out_unlock: - spin_unlock(&marks->im_lock); - outerr: - if (pages) { - kfree(array); - *pages = NULL; - } +out_unlock: + spin_unlock_bh(&marks->im_lock); +outerr: return -ENOMEM; } @@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks, dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, (u64)offset, (u64)length); - spin_lock(&marks->im_lock); + spin_lock_bh(&marks->im_lock); status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock(&marks->im_lock); + spin_unlock_bh(&marks->im_lock); return status; } @@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl, /* Note the range described by offset, length is guaranteed to be contained * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. */ int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length) + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new) { sector_t new_end, end = offset + length; - struct pnfs_block_short_extent *new; struct pnfs_block_layout *bl = container_of(be->be_inval, struct pnfs_block_layout, bl_inval); - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - mark_written_sectors(be->be_inval, offset, length); /* We want to add the range to commit list, but it must be * block-normalized, and verified that the normalized range has @@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be, new->bse_mdev = be->be_mdev; spin_lock(&bl->bl_ext_lock); - /* new will be freed, either by add_to_commitlist if it decides not - * to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ add_to_commitlist(bl, new); spin_unlock(&bl->bl_ext_lock); return 0; @@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, } } } + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *new; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (unlikely(!new)) + return -ENOMEM; + + spin_lock_bh(&marks->im_lock); + list_add(&new->bse_node, &marks->im_extents); + spin_unlock_bh(&marks->im_lock); + + return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *rv = NULL; + + spin_lock_bh(&marks->im_lock); + if (!list_empty(&marks->im_extents)) { + rv = list_entry((&marks->im_extents)->next, + struct pnfs_block_short_extent, bse_node); + list_del_init(&rv->bse_node); + } + spin_unlock_bh(&marks->im_lock); + + return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ + struct pnfs_block_short_extent *se = NULL, *tmp; + + if (num_to_free <= 0) + return; + + spin_lock(&marks->im_lock); + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + if (--num_to_free == 0) + break; + } + spin_unlock(&marks->im_lock); + + BUG_ON(num_to_free > 0); +} diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 07df5f1d85e..c89d3b9e483 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -162,7 +162,7 @@ struct cb_layoutrecallargs { }; }; -extern unsigned nfs4_callback_layoutrecall( +extern __be32 nfs4_callback_layoutrecall( struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 43926add945..54cea8ad5a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s enter. slotid %d seqid %d\n", __func__, args->csa_slotid, args->csa_sequenceid); - if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); slot = tbl->slots + args->csa_slotid; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 726e59a9e50..d50b2742f23 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n <= 0) goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 873bf00d51a..31778f74357 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -84,7 +84,7 @@ retry: /* * Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ -static int nfs4_disable_idmapping = 0; +static bool nfs4_disable_idmapping = true; /* * RPC cruft for NFS @@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif - cred = rpc_lookup_machine_cred(); + cred = rpc_lookup_machine_cred("*"); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; nfs_fscache_get_client_cookie(clp); @@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server) rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs4_purge_state_owners(server); +} + #else static void nfs4_shutdown_client(struct nfs_client *clp) { @@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); atomic_set(&server->active, 0); @@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; out: nfs_free_fattr(fattr); return error; @@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; + server->destroy = source->destroy; atomic_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ac289909814..fd9a872fada 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -47,13 +47,13 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); -static int nfs_mkdir(struct inode *, struct dentry *, int); +static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); +static int nfs_mkdir(struct inode *, struct dentry *, umode_t); static int nfs_rmdir(struct inode *, struct dentry *); static int nfs_unlink(struct inode *, struct dentry *); static int nfs_symlink(struct inode *, struct dentry *, const char *); static int nfs_link(struct dentry *, struct inode *, struct dentry *); -static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); +static int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); @@ -112,7 +112,7 @@ const struct inode_operations nfs3_dir_inode_operations = { #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); +static int nfs_open_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd); const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_open_create, .lookup = nfs_atomic_lookup, @@ -1368,18 +1368,7 @@ static fmode_t flags_to_mode(int flags) static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) { - struct nfs_open_context *ctx; - struct rpc_cred *cred; - fmode_t fmode = flags_to_mode(open_flags); - - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return ERR_CAST(cred); - ctx = alloc_nfs_open_context(dentry, cred, fmode); - put_rpccred(cred); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - return ctx; + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); } static int do_open(struct inode *inode, struct file *filp) @@ -1584,8 +1573,8 @@ no_open: return nfs_lookup_revalidate(dentry, nd); } -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int nfs_open_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct nfs_open_context *ctx = NULL; struct iattr attr; @@ -1675,8 +1664,8 @@ out_error: * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int nfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, struct nameidata *nd) { struct iattr attr; int error; @@ -1704,7 +1693,7 @@ out_err: * See comments for nfs_proc_create regarding failed operations. */ static int -nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; @@ -1730,7 +1719,7 @@ out_err: /* * See comments for nfs_proc_create regarding failed operations. */ -static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; int error; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 606ef0f20ae..c43a452f7da 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) datasync); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); + if (status >= 0 && ret < 0) + status = ret; have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); if (have_error) ret = xchg(&ctx->error, 0); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 47d1c6ff2d8..2c05f1991e1 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -38,6 +38,89 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + __u32 uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + __u32 gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 50a15fa8cf9..f649fba8c38 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -38,6 +38,7 @@ #include <linux/nfs_xdr.h> #include <linux/slab.h> #include <linux/compat.h> +#include <linux/freezer.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ -static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); @@ -77,7 +78,7 @@ int nfs_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - schedule(); + freezable_schedule(); return 0; } @@ -629,23 +630,28 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) nfs_revalidate_inode(server, inode); } -struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) { struct nfs_open_context *ctx; + struct rpc_cred *cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (ctx != NULL) { - nfs_sb_active(dentry->d_sb); - ctx->dentry = dget(dentry); - ctx->cred = get_rpccred(cred); - ctx->state = NULL; - ctx->mode = f_mode; - ctx->flags = 0; - ctx->error = 0; - nfs_init_lock_context(&ctx->lock_context); - ctx->lock_context.open_context = ctx; - INIT_LIST_HEAD(&ctx->list); + if (!ctx) { + put_rpccred(cred); + return ERR_PTR(-ENOMEM); } + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); + ctx->cred = cred; + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); return ctx; } @@ -738,15 +744,10 @@ static void nfs_file_clear_open_context(struct file *filp) int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct rpc_cred *cred; - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode); - put_rpccred(cred); - if (ctx == NULL) - return -ENOMEM; + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_set_inode_cookie(inode, filp); @@ -1019,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; } struct nfs_fattr *nfs_alloc_fattr(void) @@ -1464,7 +1467,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) static void nfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3f4d95751d5..8102db9b926 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); @@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d4bc9ed9174..91943953a37 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -17,6 +17,7 @@ #include <linux/nfs_page.h> #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> +#include <linux/freezer.h> #include "iostat.h" #include "internal.h" @@ -32,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX && res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 693ae22f873..4d7d0aedc10 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -94,6 +94,8 @@ struct nfs_unique_id { struct nfs4_state_owner { struct nfs_unique_id so_owner_id; struct nfs_server *so_server; + struct list_head so_lru; + unsigned long so_expires; struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ @@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index a62d36b9a99..71ec08617e2 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, loff_t offset) { u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; - u64 tmp; + u64 stripe_no; + u32 rem; offset -= flseg->pattern_offset; - tmp = offset; - do_div(tmp, stripe_width); + stripe_no = div_u64(offset, stripe_width); + div_u64_rem(offset, flseg->stripe_unit, &rem); - return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); + return stripe_no * flseg->stripe_unit + rem; } /* This function is used by the layout driver to calculate the diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index ed388aae968..8ae91908f5a 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; - u32 port; + __be16 port; int nlen, rlen; int tmp[2]; __be32 *p; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d9f4d78c341..ec9f6ef6c5d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -52,9 +52,11 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> +#include <linux/nfs_idmap.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/xattr.h> #include <linux/utsname.h> +#include <linux/freezer.h> #include "nfs4_fs.h" #include "delegation.h" @@ -243,7 +245,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; - schedule_timeout_killable(*timeout); + freezable_schedule_timeout_killable(*timeout); if (fatal_signal_pending(current)) res = -ERESTARTSYS; *timeout <<= 1; @@ -363,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp * Must be called while holding tbl->slot_tbl_lock */ static void -nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) { - int free_slotid = free_slot - tbl->slots; int slotid = free_slotid; BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); @@ -430,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) } spin_lock(&tbl->slot_tbl_lock); - nfs4_free_slot(tbl, res->sr_slot); + nfs4_free_slot(tbl, res->sr_slot - tbl->slots); nfs4_check_drain_fc_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; @@ -553,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session, spin_lock(&tbl->slot_tbl_lock); if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { - /* - * The state manager will wait until the slot table is empty. - * Schedule the reset thread - */ + /* The state manager will wait until the slot table is empty */ rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s Schedule Session Reset\n", __func__); + dprintk("%s session is draining\n", __func__); return -EAGAIN; } @@ -764,6 +762,8 @@ struct nfs4_opendata { struct nfs_openres o_res; struct nfs_open_confirmargs c_arg; struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; struct dentry *dir; @@ -787,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); + nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, @@ -818,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.dir_bitmask = server->cache_consistency_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (flags & O_CREAT) { u32 *s; @@ -854,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dir); dput(p->dentry); nfs_sb_deactive(sb); + nfs_fattr_free_names(&p->f_attr); kfree(p); } @@ -1578,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) if (status != 0 || !data->rpc_done) return status; + nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); + nfs_refresh_inode(dir, o_res->dir_attr); if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1610,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } + nfs_fattr_map_and_free_names(server, &data->f_attr); + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); nfs_post_op_update_inode(dir, o_res->dir_attr); @@ -3430,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) */ #define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) -static void buf_to_pages(const void *buf, size_t buflen, - struct page **pages, unsigned int *pgbase) -{ - const void *p = buf; - - *pgbase = offset_in_page(buf); - p -= *pgbase; - while (p < buf + buflen) { - *(pages++) = virt_to_page(p); - p += PAGE_CACHE_SIZE; - } -} - static int buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages, unsigned int *pgbase) { @@ -3539,9 +3533,19 @@ out: nfs4_set_cached_acl(inode, acl); } +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf. On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. + */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES]; + struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -3556,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - struct page *localpage = NULL; - int ret; + int ret = -ENOMEM, npages, i, acl_len = 0; - if (buflen < PAGE_SIZE) { - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - localpage = alloc_page(GFP_KERNEL); - resp_buf = page_address(localpage); - if (localpage == NULL) - return -ENOMEM; - args.acl_pages[0] = localpage; - args.acl_pgbase = 0; - args.acl_len = PAGE_SIZE; - } else { - resp_buf = buf; - buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + if (npages == 0) + npages = 1; + + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + if (npages > 1) { + /* for decoding across pages */ + res.acl_scratch = alloc_page(GFP_KERNEL); + if (!res.acl_scratch) + goto out_free; } - ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + args.acl_len = npages * PAGE_SIZE; + args.acl_pgbase = 0; + /* Let decode_getfacl know not to fail if the ACL data is larger than + * the page we send as a guess */ + if (buf == NULL) + res.acl_flags |= NFS4_ACL_LEN_REQUEST; + resp_buf = page_address(pages[0]); + + dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", + __func__, buf, buflen, npages, args.acl_len); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), + &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; - if (res.acl_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, res.acl_len); + + acl_len = res.acl_len - res.acl_data_offset; + if (acl_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, acl_len); else - nfs4_write_cached_acl(inode, resp_buf, res.acl_len); + nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset, + acl_len); if (buf) { ret = -ERANGE; - if (res.acl_len > buflen) + if (acl_len > buflen) goto out_free; - if (localpage) - memcpy(buf, resp_buf, res.acl_len); + _copy_from_pages(buf, pages, res.acl_data_offset, + res.acl_len); } - ret = res.acl_len; + ret = acl_len; out_free: - if (localpage) - __free_page(localpage); + for (i = 0; i < npages; i++) + if (pages[i]) + __free_page(pages[i]); + if (res.acl_scratch) + __free_page(res.acl_scratch); return ret; } @@ -3621,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) + /* -ENOENT is returned if there is no ACL or if there is an ACL + * but no cached acl data, just the acl length */ return ret; return nfs4_get_acl_uncached(inode, buf, buflen); } @@ -3958,7 +3983,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; @@ -4858,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->cl_rpcclient->cl_auth->au_flavor); res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); - if (unlikely(!res.server_scope)) - return -ENOMEM; + if (unlikely(!res.server_scope)) { + status = -ENOMEM; + goto out; + } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) @@ -4876,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->server_scope = NULL; } - if (!clp->server_scope) + if (!clp->server_scope) { clp->server_scope = res.server_scope; - else - kfree(res.server_scope); + goto out; + } } - + kfree(res.server_scope); +out: dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -4983,37 +5011,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } +static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +{ + return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); +} + +static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, + struct nfs4_slot *new, + u32 max_slots, + u32 ivalue) +{ + struct nfs4_slot *old = NULL; + u32 i; + + spin_lock(&tbl->slot_tbl_lock); + if (new) { + old = tbl->slots; + tbl->slots = new; + tbl->max_slots = max_slots; + } + tbl->highest_used_slotid = -1; /* no slot is currently used */ + for (i = 0; i < tbl->max_slots; i++) + tbl->slots[i].seq_nr = ivalue; + spin_unlock(&tbl->slot_tbl_lock); + kfree(old); +} + /* - * Reset a slot table + * (re)Initialise a slot table */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - int ivalue) +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + u32 ivalue) { struct nfs4_slot *new = NULL; - int i; - int ret = 0; + int ret = -ENOMEM; dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, max_reqs, tbl->max_slots); /* Does the newly negotiated max_reqs match the existing slot table? */ if (max_reqs != tbl->max_slots) { - ret = -ENOMEM; - new = kmalloc(max_reqs * sizeof(struct nfs4_slot), - GFP_NOFS); + new = nfs4_alloc_slots(max_reqs, GFP_NOFS); if (!new) goto out; - ret = 0; - kfree(tbl->slots); - } - spin_lock(&tbl->slot_tbl_lock); - if (new) { - tbl->slots = new; - tbl->max_slots = max_reqs; } - for (i = 0; i < tbl->max_slots; ++i) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); + ret = 0; + + nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); out: @@ -5021,23 +5065,6 @@ out: return ret; } -/* - * Reset the forechannel and backchannel slot tables - */ -static int nfs4_reset_slot_tables(struct nfs4_session *session) -{ - int status; - - status = nfs4_reset_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, 1); - if (status) - return status; - - status = nfs4_reset_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, 0); - return status; -} - /* Destroy the slot table */ static void nfs4_destroy_slot_tables(struct nfs4_session *session) { @@ -5053,59 +5080,26 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) } /* - * Initialize slot table + * Initialize or reset the forechannel and backchannel tables */ -static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, - int max_slots, int ivalue) -{ - struct nfs4_slot *slot; - int ret = -ENOMEM; - - BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); - - dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - - slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); - if (!slot) - goto out; - ret = 0; - - spin_lock(&tbl->slot_tbl_lock); - tbl->max_slots = max_slots; - tbl->slots = slot; - tbl->highest_used_slotid = -1; /* no slot is currently used */ - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - -/* - * Initialize the forechannel and backchannel tables - */ -static int nfs4_init_slot_tables(struct nfs4_session *session) +static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) { struct nfs4_slot_table *tbl; - int status = 0; - - tbl = &session->fc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->fc_attrs.max_reqs, 1); - if (status) - return status; - } - - tbl = &session->bc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, - session->bc_attrs.max_reqs, 0); - if (status) - nfs4_destroy_slot_tables(session); - } + int status; + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); return status; } @@ -5293,13 +5287,9 @@ int nfs4_proc_create_session(struct nfs_client *clp) if (status) goto out; - /* Init and reset the fore channel */ - status = nfs4_init_slot_tables(session); - dprintk("slot table initialization returned %d\n", status); - if (status) - goto out; - status = nfs4_reset_slot_tables(session); - dprintk("slot table reset returned %d\n", status); + /* Init or reset the session slot tables */ + status = nfs4_setup_session_slot_tables(session); + dprintk("slot table setup returned %d\n", status); if (status) goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6a7107ae6b7..45392032e7b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/bitops.h> +#include <linux/jiffies.h> #include "nfs4_fs.h" #include "callback.h" @@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; - struct nfs4_state_owner *sp, *res = NULL; + struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (cred < sp->so_cred) p = &parent->rb_left; else if (cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); - res = sp; - break; + return sp; } } - return res; + return NULL; } static struct nfs4_state_owner * @@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) else if (new->so_cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); return sp; } @@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void) spin_lock_init(&sp->so_sequence.lock); INIT_LIST_HEAD(&sp->so_sequence.list); atomic_set(&sp->so_count, 1); + INIT_LIST_HEAD(&sp->so_lru); return sp; } @@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp) } } +static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ + rpc_destroy_wait_queue(&sp->so_sequence.wait); + put_rpccred(sp->so_cred); + kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + unsigned long time_min, time_max; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + time_max = jiffies; + time_min = (long)time_max - (long)clp->cl_lease_time; + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + /* NB: LRU is sorted so that oldest is at the head */ + if (time_in_range(sp->so_expires, time_min, time_max)) + break; + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + /** * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search @@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) - return sp; + goto out; new = nfs4_alloc_state_owner(); if (new == NULL) - return NULL; + goto out; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, rpc_destroy_wait_queue(&new->so_sequence.wait); kfree(new); } +out: + nfs4_gc_state_owners(server); return sp; } /** * nfs4_put_state_owner - Release a nfs4_state_owner * @sp: state owner data to release - * */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_server->nfs_client; - struct rpc_cred *cred = sp->so_cred; + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner_locked(sp); + + if (!RB_EMPTY_NODE(&sp->so_server_node)) { + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); + spin_unlock(&clp->cl_lock); + } else { + nfs4_remove_state_owner_locked(sp); + spin_unlock(&clp->cl_lock); + nfs4_free_state_owner(sp); + } +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time. Remaining state owners will be on + * the LRU with ref count of zero. + */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } spin_unlock(&clp->cl_lock); - rpc_destroy_wait_queue(&sp->so_sequence.wait); - put_rpccred(cred); - kfree(sp); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } } static struct nfs4_state * @@ -1071,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 { struct nfs_client *clp = server->nfs_client; + if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags)) + nfs_async_inode_return_delegation(state->inode, &state->stateid); nfs4_state_mark_reclaim_nograce(clp, state); nfs4_schedule_state_manager(clp); } @@ -1402,6 +1465,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + nfs4_purge_state_owners(server); spin_lock(&clp->cl_lock); for (pos = rb_first(&server->state_owners); pos != NULL; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6161b213ed..33bd8d0f745 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfh(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + encode_getfattr(xdr, args->dir_bitmask, &hdr); encode_nops(&hdr); } @@ -2517,11 +2517,12 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz + 1; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); + encode_nops(&hdr); } @@ -3790,7 +3791,8 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, + struct nfs4_string *owner_name) { uint32_t len; __be32 *p; @@ -3807,8 +3809,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (owner_name != NULL) { + owner_name->data = kmemdup(p, len, GFP_NOWAIT); + if (owner_name->data != NULL) { + owner_name->len = len; + ret = NFS_ATTR_FATTR_OWNER_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; @@ -3828,7 +3834,8 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - const struct nfs_server *server, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, + struct nfs4_string *group_name) { uint32_t len; __be32 *p; @@ -3845,8 +3852,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, p = xdr_inline_decode(xdr, len); if (unlikely(!p)) goto out_overflow; - if (!may_sleep) { - /* do nothing */ + if (group_name != NULL) { + group_name->data = kmemdup(p, len, GFP_NOWAIT); + if (group_name->data != NULL) { + group_name->len = len; + ret = NFS_ATTR_FATTR_GROUP_NAME; + } } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; @@ -4283,7 +4294,7 @@ xdr_error: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { int status; umode_t fmode = 0; @@ -4350,12 +4361,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -4396,7 +4407,7 @@ xdr_error: } static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, - struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) + struct nfs_fh *fh, const struct nfs_server *server) { __be32 *savep; uint32_t attrlen, @@ -4415,7 +4426,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); if (status < 0) goto xdr_error; @@ -4426,9 +4437,9 @@ xdr_error: } static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, - const struct nfs_server *server, int may_sleep) + const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); + return decode_getfattr_generic(xdr, fattr, NULL, server); } /* @@ -4957,17 +4968,18 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - size_t *acl_len) + struct nfs_getaclres *res) { - __be32 *savep; + __be32 *savep, *bm_p; uint32_t attrlen, bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; - *acl_len = 0; + res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto out; + bm_p = xdr->p; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -4979,18 +4991,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, size_t hdrlen; u32 recvd; + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + xdr->p = bm_p; + res->acl_data_offset = be32_to_cpup(bm_p) + 2; + res->acl_data_offset <<= 2; + /* We ignore &savep and don't do consistency checks on * the attr length. Let userspace figure it out.... */ hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + attrlen += res->acl_data_offset; recvd = req->rq_rcv_buf.len - hdrlen; if (attrlen > recvd) { - dprintk("NFS: server cheating in getattr" - " acl reply: attrlen %u > recvd %u\n", + if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { + /* getxattr interface called with a NULL buf */ + res->acl_len = attrlen; + goto out; + } + dprintk("NFS: acl reply: attrlen %u > recvd %u\n", attrlen, recvd); return -EINVAL; } xdr_read_pages(xdr, attrlen); - *acl_len = attrlen; + res->acl_len = attrlen; } else status = -EOPNOTSUPP; @@ -5696,8 +5720,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, status = decode_open_downgrade(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5723,8 +5746,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, res); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5753,8 +5775,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server - ,!RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5780,8 +5801,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5807,8 +5827,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_remove(xdr, &res->cinfo); if (status) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -5841,14 +5860,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; /* Current FH is target directory */ - if (decode_getfattr(xdr, res->new_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->new_fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->old_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->old_fattr, res->server); out: return status; } @@ -5884,14 +5901,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->dir_attr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -5923,14 +5938,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - if (decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->fattr, res->server)) goto out; status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->dir_fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_fattr, res->server); out: return status; } @@ -5962,8 +5975,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6019,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; + if (res->acl_scratch != NULL) { + void *p = page_address(res->acl_scratch); + xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); + } status = decode_compound_hdr(xdr, &hdr); if (status) goto out; @@ -6028,7 +6044,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, &res->acl_len); + status = decode_getacl(xdr, rqstp, res); out: return status; @@ -6061,8 +6077,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6093,13 +6108,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)) != 0) + if (decode_getfattr(xdr, res->f_attr, res->server) != 0) goto out; if (decode_restorefh(xdr) != 0) goto out; - decode_getfattr(xdr, res->dir_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -6147,8 +6160,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->f_attr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->f_attr, res->server); out: return status; } @@ -6175,8 +6187,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6356,8 +6367,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); if (!status) status = res->count; out: @@ -6386,8 +6396,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6546,8 +6555,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_delegreturn(xdr); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6576,8 +6584,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr(xdr, &res->fs_locations->fattr, - res->fs_locations->server, - !RPC_IS_ASYNC(req->rq_task)); + res->fs_locations->server); out: return status; } @@ -6826,8 +6833,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, status = decode_layoutcommit(xdr, rqstp, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6958,7 +6964,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - entry->server, 1) < 0) + entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c807ab93140..55d01280a60 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = { static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 72074e3a04f..b3c29039f5b 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; + else + rdata->pnfs_error = status; objlayout_iodone(oir); /* must not use oir after this point */ @@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) if (status >= 0) { wdata->res.count = status; wdata->verf.committed = oir->committed; + } else { + wdata->pnfs_error = status; } objlayout_iodone(oir); /* must not use oir after this point */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8e672a2b2d6..17149a49006 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); +static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head) +{ + struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); + + /* Resend all requests through the MDS */ + nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE); + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + /* For some reason our attempt to resend pages. Mark the + * overall send request as having failed, and let + * nfs_writeback_release_full deal with the error. + */ + list_move(&failed, head); + return -EIO; + } + return 0; +} + /* * Called by non rpc-based layout drivers */ @@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data) pnfs_set_layoutcommit(data); data->mds_ops->rpc_call_done(&data->task, data); } else { - put_lseg(data->lseg); - data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + /* Don't lo_commit on error, Server will needs to + * preform a file recovery. + */ + clear_bit(NFS_INO_LAYOUTCOMMIT, + &NFS_I(data->inode)->flags); + pnfs_return_layout(data->inode); + } + data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages); } data->mds_ops->rpc_release(data); } @@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) put_lseg(data->lseg); data->lseg = NULL; dprintk("pnfs write error = %d\n", data->pnfs_error); + if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) + pnfs_return_layout(data->inode); nfs_pageio_init_read_mds(&pgio, data->inode); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1509530cb11..53d593a0a4f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -68,6 +68,7 @@ enum { enum layoutdriver_policy_flags { /* Should the pNFS client commit and return the layout upon a setattr */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, }; struct nfs4_deviceid_node; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f48125da198..0c672588fe5 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,6 +41,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/lockd/bind.h> +#include <linux/freezer.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -59,7 +60,7 @@ nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EKEYEXPIRED) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 134777406ee..3dfa4f112c0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -41,7 +41,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/mnt_namespace.h> #include <linux/namei.h> #include <linux/nfs_idmap.h> #include <linux/vfs.h> @@ -263,10 +262,10 @@ static match_table_t nfs_local_lock_tokens = { static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); -static int nfs_show_options(struct seq_file *, struct vfsmount *); -static int nfs_show_devname(struct seq_file *, struct vfsmount *); -static int nfs_show_path(struct seq_file *, struct vfsmount *); -static int nfs_show_stats(struct seq_file *, struct vfsmount *); +static int nfs_show_options(struct seq_file *, struct dentry *); +static int nfs_show_devname(struct seq_file *, struct dentry *); +static int nfs_show_path(struct seq_file *, struct dentry *); +static int nfs_show_stats(struct seq_file *, struct dentry *); static struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, @@ -721,9 +720,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, /* * Describe the mount options on this VFS mountpoint */ -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_options(struct seq_file *m, struct dentry *root) { - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); nfs_show_mount_options(m, nfss, 0); @@ -761,14 +760,14 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} #endif #endif -static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_devname(struct seq_file *m, struct dentry *root) { char *page = (char *) __get_free_page(GFP_KERNEL); char *devname, *dummy; int err = 0; if (!page) return -ENOMEM; - devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE); + devname = nfs_path(&dummy, root, page, PAGE_SIZE); if (IS_ERR(devname)) err = PTR_ERR(devname); else @@ -777,7 +776,7 @@ static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) return err; } -static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_path(struct seq_file *m, struct dentry *dentry) { seq_puts(m, "/"); return 0; @@ -786,10 +785,10 @@ static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt) /* * Present statistical information for this VFS mountpoint */ -static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +static int nfs_show_stats(struct seq_file *m, struct dentry *root) { int i, cpu; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); struct rpc_auth *auth = nfss->client->cl_auth; struct nfs_iostats totals = { }; @@ -799,10 +798,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) * Display all mount option settings */ seq_printf(m, "\n\topts:\t"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -909,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->auth_flavor_len = 1; data->version = version; data->minorversion = 0; + security_init_mnt_opts(&data->lsm_opts); } return data; } +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); + } +} + /* * Sanity-check a server address provided by the mount command. * @@ -2220,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Validate the mount data */ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); @@ -2234,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, #ifdef CONFIG_NFS_V4 if (data->version == 4) { mntroot = nfs4_try_mount(flags, dev_name, data); - kfree(data->client_address); - kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ @@ -2290,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; out: - kfree(data->nfs_server.hostname); - kfree(data->mount_server.hostname); - kfree(data->fscache_uniq); - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: + nfs_free_parsed_mount_data(data); nfs_free_fhandle(mntfh); - kfree(data); return mntroot; out_err_nosb: @@ -2623,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) - goto out_free_fh; - - security_init_mnt_opts(&data->lsm_opts); + goto out; /* Get a volume representation */ server = nfs4_create_server(data, mntfh); @@ -2677,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; - security_free_mnt_opts(&data->lsm_opts); nfs_free_fhandle(mntfh); return mntroot; out: - security_free_mnt_opts(&data->lsm_opts); -out_free_fh: nfs_free_fhandle(mntfh); return ERR_PTR(error); @@ -2788,11 +2787,15 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { struct dentry *dentry; - int ret = nfs_referral_loop_protect(); + int err; + + if (IS_ERR(root_mnt)) + return ERR_CAST(root_mnt); - if (ret) { + err = nfs_referral_loop_protect(); + if (err) { mntput(root_mnt); - return ERR_PTR(ret); + return ERR_PTR(err); } dentry = mount_subtree(root_mnt, export_path); @@ -2816,9 +2819,7 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name, data->nfs_server.hostname); data->nfs_server.export_path = export_path; - res = ERR_CAST(root_mnt); - if (!IS_ERR(root_mnt)) - res = nfs_follow_remote_path(root_mnt, export_path); + res = nfs_follow_remote_path(root_mnt, export_path); dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", IS_ERR(res) ? PTR_ERR(res) : 0, @@ -2838,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, data = nfs_alloc_parsed_mount_data(4); if (data == NULL) - goto out_free_data; + goto out; /* Validate the mount data */ error = nfs4_validate_mount_data(raw_data, data, dev_name); @@ -2852,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type, error = PTR_ERR(res); out: - kfree(data->client_address); - kfree(data->nfs_server.export_path); - kfree(data->nfs_server.hostname); - kfree(data->fscache_uniq); -out_free_data: - kfree(data); + nfs_free_parsed_mount_data(data); dprintk("<-- nfs4_mount() = %d%s\n", error, error != 0 ? " [error]" : ""); return res; @@ -3079,9 +3075,7 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, flags, data, data->hostname); data->mnt_path = export_path; - res = ERR_CAST(root_mnt); - if (!IS_ERR(root_mnt)) - res = nfs_follow_remote_path(root_mnt, export_path); + res = nfs_follow_remote_path(root_mnt, export_path); dprintk("<-- nfs4_referral_mount() = %ld%s\n", IS_ERR(res) ? PTR_ERR(res) : 0, IS_ERR(res) ? " [error]" : ""); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1dda78db6a7..834f0fe96f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, @@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) static void nfs_writeback_release_full(void *calldata) { struct nfs_write_data *data = calldata; - int ret, status = data->task.tk_status; - struct nfs_pageio_descriptor pgio; - - if (data->pnfs_error) { - nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); - pgio.pg_recoalesce = 1; - } + int status = data->task.tk_status; /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { @@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (data->pnfs_error) { - dprintk(", pnfs error = %d\n", data->pnfs_error); - goto next; - } - if (status < 0) { nfs_set_pageerror(page); nfs_context_set_write_error(req->wb_context, status); @@ -1212,19 +1201,7 @@ remove_request: next: nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); - if (data->pnfs_error) { - lock_page(page); - nfs_pageio_cond_complete(&pgio, page->index); - ret = nfs_page_async_flush(&pgio, page, 0); - if (ret) { - nfs_set_pageerror(page); - dprintk("rewrite to MDS error = %d\n", ret); - } - unlock_page(page); - } } - if (data->pnfs_error) - nfs_pageio_complete(&pgio); nfs_writedata_release(calldata); } @@ -1711,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 10e6366608f..8df1ea4a6ff 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -80,3 +80,13 @@ config NFSD_V4 available from http://linux-nfs.org/. If unsure, say N. + +config NFSD_FAULT_INJECTION + bool "NFS server manual fault injection" + depends on NFSD_V4 && DEBUG_KERNEL + help + This option enables support for manually injecting faults + into the NFS server. This is intended to be used for + testing error recovery on the NFS client. + + If unsure, say N. diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 9b118ee2019..af32ef06b4f 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 62f3b9074e8..cf8a6bd062f 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) struct svc_expkey key; struct svc_expkey *ek = NULL; - if (mesg[mlen-1] != '\n') + if (mlen < 1 || mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; @@ -1226,12 +1226,12 @@ nfsd_export_init(void) int rv; dprintk("nfsd: initializing export module.\n"); - rv = cache_register(&svc_export_cache); + rv = cache_register_net(&svc_export_cache, &init_net); if (rv) return rv; - rv = cache_register(&svc_expkey_cache); + rv = cache_register_net(&svc_expkey_cache, &init_net); if (rv) - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_export_cache, &init_net); return rv; } @@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void) dprintk("nfsd: shutting down export module.\n"); - cache_unregister(&svc_expkey_cache); - cache_unregister(&svc_export_cache); + cache_unregister_net(&svc_expkey_cache, &init_net); + cache_unregister_net(&svc_export_cache, &init_net); svcauth_unix_purge(); dprintk("nfsd: export shutdown complete.\n"); diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c new file mode 100644 index 00000000000..ce7f0758d84 --- /dev/null +++ b/fs/nfsd/fault_inject.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> + * + * Uses debugfs to create fault injection points for client testing + */ + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/module.h> + +#include "state.h" +#include "fault_inject.h" + +struct nfsd_fault_inject_op { + char *file; + void (*func)(u64); +}; + +static struct nfsd_fault_inject_op inject_ops[] = { + { + .file = "forget_clients", + .func = nfsd_forget_clients, + }, + { + .file = "forget_locks", + .func = nfsd_forget_locks, + }, + { + .file = "forget_openowners", + .func = nfsd_forget_openowners, + }, + { + .file = "forget_delegations", + .func = nfsd_forget_delegations, + }, + { + .file = "recall_delegations", + .func = nfsd_recall_delegations, + }, +}; + +static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); +static struct dentry *debug_dir; + +static int nfsd_inject_set(void *op_ptr, u64 val) +{ + struct nfsd_fault_inject_op *op = op_ptr; + + if (val == 0) + printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); + else + printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); + + op->func(val); + return 0; +} + +static int nfsd_inject_get(void *data, u64 *val) +{ + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); + +void nfsd_fault_inject_cleanup(void) +{ + debugfs_remove_recursive(debug_dir); +} + +int nfsd_fault_inject_init(void) +{ + unsigned int i; + struct nfsd_fault_inject_op *op; + mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + + debug_dir = debugfs_create_dir("nfsd", NULL); + if (!debug_dir) + goto fail; + + for (i = 0; i < NUM_INJECT_OPS; i++) { + op = &inject_ops[i]; + if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) + goto fail; + } + return 0; + +fail: + nfsd_fault_inject_cleanup(); + return -ENOMEM; +} diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h new file mode 100644 index 00000000000..90bd0570956 --- /dev/null +++ b/fs/nfsd/fault_inject.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> + * + * Function definitions for fault injection + */ + +#ifndef LINUX_NFSD_FAULT_INJECT_H +#define LINUX_NFSD_FAULT_INJECT_H + +#ifdef CONFIG_NFSD_FAULT_INJECTION +int nfsd_fault_inject_init(void); +void nfsd_fault_inject_cleanup(void); +void nfsd_forget_clients(u64); +void nfsd_forget_locks(u64); +void nfsd_forget_openowners(u64); +void nfsd_forget_delegations(u64); +void nfsd_recall_delegations(u64); +#else /* CONFIG_NFSD_FAULT_INJECTION */ +static inline int nfsd_fault_inject_init(void) { return 0; } +static inline void nfsd_fault_inject_cleanup(void) {} +static inline void nfsd_forget_clients(u64 num) {} +static inline void nfsd_forget_locks(u64 num) {} +static inline void nfsd_forget_openowners(u64 num) {} +static inline void nfsd_forget_delegations(u64 num) {} +static inline void nfsd_recall_delegations(u64 num) {} +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + +#endif /* LINUX_NFSD_FAULT_INJECT_H */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7748d6a18d9..6f3ebb48b12 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -718,7 +718,7 @@ int set_callback_cred(void) { if (callback_cred) return 0; - callback_cred = rpc_lookup_machine_cred(); + callback_cred = rpc_lookup_machine_cred("nfs"); if (!callback_cred) return -ENOMEM; return 0; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 55780a22fdb..94096273cd6 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -36,6 +36,7 @@ #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> +#include <net/net_namespace.h> #include "idmap.h" #include "nfsd.h" @@ -466,20 +467,20 @@ nfsd_idmap_init(void) { int rv; - rv = cache_register(&idtoname_cache); + rv = cache_register_net(&idtoname_cache, &init_net); if (rv) return rv; - rv = cache_register(&nametoid_cache); + rv = cache_register_net(&nametoid_cache, &init_net); if (rv) - cache_unregister(&idtoname_cache); + cache_unregister_net(&idtoname_cache, &init_net); return rv; } void nfsd_idmap_shutdown(void) { - cache_unregister(&idtoname_cache); - cache_unregister(&nametoid_cache); + cache_unregister_net(&idtoname_cache, &init_net); + cache_unregister_net(&nametoid_cache, &init_net); } static int diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index fa383361bc6..896da74ec56 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ { __be32 status; - /* Only reclaims from previously confirmed clients are valid */ - if ((status = nfs4_check_open_reclaim(&open->op_clientid))) - return status; - /* We don't know the target directory, and therefore can not * set the change info */ @@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_PREVIOUS: open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + status = nfs4_check_open_reclaim(&open->op_clientid); + if (status) + goto out; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, &cstate->current_fh, @@ -838,7 +837,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } - status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + status = fh_want_write(&cstate->current_fh); if (status) return status; status = nfs_ok; @@ -856,7 +855,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); out: - mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); + fh_drop_write(&cstate->current_fh); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ed083b9a731..0b3e875d1ab 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -117,8 +117,7 @@ out_no_tfm: return status; } -int -nfsd4_create_clid_dir(struct nfs4_client *clp) +void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; @@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_file || clp->cl_firststate) - return 0; - + if (clp->cl_firststate) + return; clp->cl_firststate = 1; + if (!rec_file) + return; status = nfs4_save_creds(&original_cred); if (status < 0) - return status; + return; dir = rec_file->f_path.dentry; /* lock the parent */ @@ -144,14 +144,21 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = PTR_ERR(dentry); goto out_unlock; } - status = -EEXIST; if (dentry->d_inode) + /* + * In the 4.1 case, where we're called from + * reclaim_complete(), records from the previous reboot + * may still be left, so this is OK. + * + * In the 4.0 case, we should never get here; but we may + * as well be forgiving and just succeed silently. + */ goto out_put; - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out_put; status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out_put: dput(dentry); out_unlock: @@ -164,7 +171,6 @@ out_unlock: " and is writeable", status, user_recovery_dirname); nfs4_reset_creds(original_cred); - return status; } typedef int (recdir_func)(struct dentry *, struct dentry *); @@ -268,7 +274,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!rec_file || !clp->cl_firststate) return; - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out; clp->cl_firststate = 0; @@ -281,7 +287,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) nfs4_reset_creds(original_cred); if (status == 0) vfs_fsync(rec_file, 0); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -311,13 +317,13 @@ nfsd4_recdir_purge_old(void) { if (!rec_file) return; - status = mnt_want_write(rec_file->f_path.mnt); + status = mnt_want_write_file(rec_file); if (status) goto out; status = nfsd4_list_rec_dir(purge_old); if (status == 0) vfs_fsync(rec_file, 0); - mnt_drop_write(rec_file->f_path.mnt); + mnt_drop_write_file(rec_file); out: if (status) printk("nfsd4: failed to purge old clients from recovery" diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47e94e33a97..e8c98f00967 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -49,12 +49,20 @@ time_t nfsd4_lease = 90; /* default lease time */ time_t nfsd4_grace = 90; static time_t boot_time; -static stateid_t zerostateid; /* bits all 0 */ -static stateid_t onestateid; /* bits all 1 */ + +#define all_ones {{~0,~0},~0} +static const stateid_t one_stateid = { + .si_generation = ~0, + .si_opaque = all_ones, +}; +static const stateid_t zero_stateid = { + /* all fields zero */ +}; + static u64 current_sessionid = 1; -#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) -#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) /* forward declarations */ static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); @@ -133,21 +141,21 @@ unsigned int max_delegations; * Open owner state (share locks) */ -/* hash tables for open owners */ -#define OPEN_OWNER_HASH_BITS 8 -#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) -#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) +/* hash tables for lock and open owners */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) -static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) +static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) { unsigned int ret; ret = opaque_hashval(ownername->data, ownername->len); ret += clientid; - return ret & OPEN_OWNER_HASH_MASK; + return ret & OWNER_HASH_MASK; } -static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; /* hash table for nfs4_file */ #define FILE_HASH_BITS 8 @@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) list_del(&lo->lo_owner.so_strhash); list_del(&lo->lo_perstateid); + list_del(&lo->lo_owner_ino_hash); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); @@ -658,7 +667,7 @@ static int nfsd4_sanitize_slot_size(u32 size) /* * XXX: If we run out of reserved DRC memory we could (up to a point) * re-negotiate active sessions and reduce their slot usage to make - * rooom for new connections. For now we just fail the create session. + * room for new connections. For now we just fail the create session. */ static int nfsd4_get_drc_mem(int slotsize, u32 num) { @@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); if (clp == NULL) return NULL; - clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); if (clp->cl_name.data == NULL) { kfree(clp); return NULL; } - memcpy(clp->cl_name.data, name.data, name.len); clp->cl_name.len = name.len; return clp; } @@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp) spin_unlock(&recall_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { @@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void) nfsd4_free_slab(&deleg_slab); } -static int +int nfsd4_init_slabs(void) { openowner_slab = kmem_cache_create("nfsd4_openowners", @@ -2373,7 +2380,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) { - list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); + list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); list_add(&oo->oo_perclient, &clp->cl_openowners); } @@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) struct nfs4_stateowner *so; struct nfs4_openowner *oo; - list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (!so->so_is_open_owner) + continue; if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { oo = openowner(so); renew_client(oo->oo_owner.so_client); @@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); + strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); oo = find_openstateowner_str(strhashval, open); open->op_openowner = oo; if (!oo) { @@ -3123,7 +3132,6 @@ nfs4_laundromat(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } test_val = nfsd4_lease; @@ -3718,13 +3726,11 @@ out: } -/* - * Lock owner state (byte-range locks) - */ #define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) -#define LOCK_HASH_BITS 8 -#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) -#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +#define LOCKOWNER_INO_HASH_BITS 8 +#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS) +#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) static inline u64 end_offset(u64 start, u64 len) @@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1: NFS4_MAX_UINT64; } -static inline unsigned int -lock_ownerstr_hashval(struct inode *inode, u32 cl_id, - struct xdr_netobj *ownername) +static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername) { return (file_hashval(inode) + cl_id + opaque_hashval(ownername->data, ownername->len)) - & LOCK_HASH_MASK; + & LOCKOWNER_INO_HASH_MASK; } -static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; +static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE]; /* * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that @@ -3809,23 +3813,39 @@ nevermind: deny->ld_type = NFS4_WRITE_LT; } +static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) +{ + struct nfs4_ol_stateid *lst; + + if (!same_owner_str(&lo->lo_owner, owner, clid)) + return false; + lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return lst->st_file->fi_inode == inode; +} + static struct nfs4_lockowner * find_lockowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { - unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); - struct nfs4_stateowner *op; + unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); + struct nfs4_lockowner *lo; - list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { - if (same_owner_str(op, owner, clid)) - return lockowner(op); + list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { + if (same_lockowner_ino(lo, inode, clid, owner)) + return lo; } return NULL; } static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) { - list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); + struct inode *inode = open_stp->st_file->fi_inode; + unsigned int inohash = lockowner_ino_hashval(inode, + clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); + + list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); + list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); list_add(&lo->lo_perstateid, &open_stp->st_lockowners); } @@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has * occurred. * - * strhashval = lock_ownerstr_hashval + * strhashval = ownerstr_hashval */ static struct nfs4_lockowner * @@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) __set_bit(access, &lock_stp->st_access_bmap); } +__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +{ + struct nfs4_file *fi = ost->st_file; + struct nfs4_openowner *oo = openowner(ost->st_stateowner); + struct nfs4_client *cl = oo->oo_owner.so_client; + struct nfs4_lockowner *lo; + unsigned int strhashval; + + lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); + if (lo) { + if (!cstate->minorversion) + return nfserr_bad_seqid; + /* XXX: a lockowner always has exactly one stateid: */ + *lst = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, st_perstateowner); + return nfs_ok; + } + strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, + &lock->v.new.owner); + lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock); + if (lo == NULL) + return nfserr_jukebox; + *lst = alloc_init_lock_stateid(lo, fi, ost); + if (*lst == NULL) { + release_lockowner(lo); + return nfserr_jukebox; + } + *new = true; + return nfs_ok; +} + /* * LOCK operation */ @@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; - unsigned int strhashval; + bool new_state = false; int lkflg; int err; @@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * lock stateid. */ struct nfs4_ol_stateid *open_stp = NULL; - + + if (nfsd4_has_session(cstate)) + /* See rfc 5661 18.10.3: given clientid is ignored: */ + memcpy(&lock->v.new.clientid, + &cstate->session->se_client->cl_clientid, + sizeof(clientid_t)); + status = nfserr_stale_clientid; - if (!nfsd4_has_session(cstate) && - STALE_CLIENTID(&lock->lk_new_clientid)) + if (STALE_CLIENTID(&lock->lk_new_clientid)) goto out; /* validate and update open stateid and open seqid */ @@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; - if (!nfsd4_has_session(cstate) && - !same_clid(&open_sop->oo_owner.so_client->cl_clientid, + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, &lock->v.new.clientid)) goto out; - /* create lockowner and lock stateid */ - fp = open_stp->st_file; - strhashval = lock_ownerstr_hashval(fp->fi_inode, - open_sop->oo_owner.so_client->cl_clientid.cl_id, - &lock->v.new.owner); - /* XXX: Do we need to check for duplicate stateowners on - * the same file, or should they just be allowed (and - * create new stateids)? */ - status = nfserr_jukebox; - lock_sop = alloc_init_lock_stateowner(strhashval, - open_sop->oo_owner.so_client, open_stp, lock); - if (lock_sop == NULL) - goto out; - lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); - if (lock_stp == NULL) + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new_state); + if (status) goto out; } else { /* lock (lock owner + lock stateid) already exists */ @@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, NFS4_LOCK_STID, &lock_stp); if (status) goto out; - lock_sop = lockowner(lock_stp->st_stateowner); - fp = lock_stp->st_file; } - /* lock_sop and lock_stp have been created or found */ + lock_sop = lockowner(lock_stp->st_stateowner); + fp = lock_stp->st_file; lkflg = setlkflg(lock->lk_type); status = nfs4_check_openmode(lock_stp, lkflg); @@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: - if (status && lock->lk_is_new && lock_sop) + if (status && new_state) release_lockowner(lock_sop); if (!cstate->replay_owner) nfs4_unlock_state(); @@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfs4_ol_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; struct list_head matches; - int i; + unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", @@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_lock_state(); status = nfserr_locks_held; - /* XXX: we're doing a linear search through all the lockowners. - * Yipes! For now we'll just hope clients aren't really using - * release_lockowner much, but eventually we have to fix these - * data structures. */ INIT_LIST_HEAD(&matches); - for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { - if (!same_owner_str(sop, owner, clid)) - continue; - list_for_each_entry(stp, &sop->so_stateids, - st_perstateowner) { - lo = lockowner(sop); - if (check_for_locks(stp->st_file, lo)) - goto out; - list_add(&lo->lo_list, &matches); - } + + list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { + if (sop->so_is_open_owner) + continue; + if (!same_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + lo = lockowner(sop); + if (check_for_locks(stp->st_file, lo)) + goto out; + list_add(&lo->lo_list, &matches); } } /* Clients probably won't expect us to return with some (but not all) @@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid) return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; } +#ifdef CONFIG_NFSD_FAULT_INJECTION + +void nfsd_forget_clients(u64 num) +{ + struct nfs4_client *clp, *next; + int count = 0; + + nfs4_lock_state(); + list_for_each_entry_safe(clp, next, &client_lru, cl_lru) { + nfsd4_remove_clid_dir(clp); + expire_client(clp); + if (++count == num) + break; + } + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d clients", count); +} + +static void release_lockowner_sop(struct nfs4_stateowner *sop) +{ + release_lockowner(lockowner(sop)); +} + +static void release_openowner_sop(struct nfs4_stateowner *sop) +{ + release_openowner(openowner(sop)); +} + +static int nfsd_release_n_owners(u64 num, bool is_open_owner, + void (*release_sop)(struct nfs4_stateowner *)) +{ + int i, count = 0; + struct nfs4_stateowner *sop, *next; + + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { + if (sop->so_is_open_owner != is_open_owner) + continue; + release_sop(sop); + if (++count == num) + return count; + } + } + return count; +} + +void nfsd_forget_locks(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, false, release_lockowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d locks", count); +} + +void nfsd_forget_openowners(u64 num) +{ + int count; + + nfs4_lock_state(); + count = nfsd_release_n_owners(num, true, release_openowner_sop); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d open owners", count); +} + +int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *)) +{ + int i, count = 0; + struct nfs4_file *fp, *fnext; + struct nfs4_delegation *dp, *dnext; + + for (i = 0; i < FILE_HASH_SIZE; i++) { + list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { + list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { + deleg_func(dp); + if (++count == num) + return count; + } + } + } + + return count; +} + +void nfsd_forget_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + count = nfsd_process_n_delegations(num, unhash_delegation); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Forgot %d delegations", count); +} + +void nfsd_recall_delegations(u64 num) +{ + unsigned int count; + + nfs4_lock_state(); + spin_lock(&recall_lock); + count = nfsd_process_n_delegations(num, nfsd_break_one_deleg); + spin_unlock(&recall_lock); + nfs4_unlock_state(); + + printk(KERN_INFO "NFSD: Recalled %d delegations", count); +} + +#endif /* CONFIG_NFSD_FAULT_INJECTION */ + /* initialization to perform at module load time: */ -int +void nfs4_state_init(void) { - int i, status; + int i; - status = nfsd4_init_slabs(); - if (status) - return status; for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -4416,18 +4566,15 @@ nfs4_state_init(void) for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } - for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { - INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); - } - for (i = 0; i < LOCK_HASH_SIZE; i++) { - INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + INIT_LIST_HEAD(&ownerstr_hashtbl[i]); } - memset(&onestateid, ~0, sizeof(stateid_t)); + for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++) + INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]); INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); reclaim_str_hashtbl_size = 0; - return 0; } static void @@ -4526,7 +4673,6 @@ __nfs4_state_shutdown(void) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b6fa792d6b8..0ec5a1b9700 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp, static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) { if (p == argp->tmp) { - p = kmalloc(nbytes, GFP_KERNEL); + p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); if (!p) return NULL; - memcpy(p, argp->tmp, nbytes); } else { BUG_ON(p != argp->tmpp); argp->tmpp = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c45a2ea4a09..748eda93ce5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -18,6 +18,7 @@ #include "idmap.h" #include "nfsd.h" #include "cache.h" +#include "fault_inject.h" /* * We have a single directory with several nodes in it. @@ -272,7 +273,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 2. Is that directory a mount point, or * 3. Is that directory the root of an exported file system? */ - error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); + error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); path_put(&path); return error; @@ -1128,9 +1129,13 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = nfs4_state_init(); /* nfs4 locking state */ + retval = nfsd4_init_slabs(); if (retval) return retval; + nfs4_state_init(); + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_free_slabs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1161,6 +1166,8 @@ out_free_cache: nfsd_reply_cache_shutdown(); out_free_stat: nfsd_stat_shutdown(); + nfsd_fault_inject_cleanup(); +out_free_slabs: nfsd4_free_slabs(); return retval; } @@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd_idmap_shutdown(); nfsd4_free_slabs(); + nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 58134a23fdf..1d1e8589b4c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned int max_delegations; -int nfs4_state_init(void); +void nfs4_state_init(void); +int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); void nfs4_state_shutdown(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else -static inline int nfs4_state_init(void) { return 0; } +static inline void nfs4_state_init(void) { } +static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } static inline void nfs4_state_shutdown(void) { } @@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ -#define NFSD_WRITEONLY_ATTRS_WORD1 \ -(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEONLY_ATTRS_WORD1 \ + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ -#define NFSD_WRITEABLE_ATTRS_WORD0 \ -(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) -#define NFSD_WRITEABLE_ATTRS_WORD1 \ -(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD0 \ + (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #define NFSD_WRITEABLE_ATTRS_WORD2 0 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c763de5c115..68454e75fce 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,7 +59,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) +nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) { mode &= S_IFMT; @@ -293,7 +293,7 @@ out: * include/linux/nfsd/nfsd.h. */ __be32 -fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { struct svc_export *exp; struct dentry *dentry; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index c16f8d8331b..e5e6707ba68 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -102,7 +102,7 @@ extern char * SVCFH_fmt(struct svc_fh *fhp); /* * Function prototypes */ -__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); __be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index a3cf38476a1..ffb5df1db94 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -366,6 +366,7 @@ struct nfs4_openowner { struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_owner_ino_hash; /* hash by owner,file */ struct list_head lo_perstateid; /* for lockowners only */ struct list_head lo_list; /* for temporary uses */ }; @@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void); extern int nfs4_client_to_reclaim(const char *name); extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void nfsd4_recdir_purge_old(void); -extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7a2e442623c..edf6d3ed877 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -307,7 +307,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, struct dentry *dentry; struct inode *inode; int accmode = NFSD_MAY_SATTR; - int ftype = 0; + umode_t ftype = 0; __be32 err; int host_err; int size_change = 0; @@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } -#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." -#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" +/* + * NFS junction information is stored in an extended attribute. + */ +#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs" + +/** + * nfsd4_is_junction - Test if an object could be an NFS junction + * + * @dentry: object to test + * + * Returns 1 if "dentry" appears to contain NFS junction information. + * Otherwise 0 is returned. + */ int nfsd4_is_junction(struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) + if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) return 0; return 1; } @@ -730,7 +741,7 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access, struct file **filp) { struct dentry *dentry; @@ -1300,7 +1311,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; @@ -1325,7 +1336,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, break; } if (host_err < 0) { - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out_nfserr; } @@ -1339,7 +1350,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); /* * Update the file handle to get the new inode info. */ @@ -1430,7 +1441,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; if (dchild->d_inode) { @@ -1469,13 +1480,13 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_GUARDED: err = nfserr_exist; } - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out; } host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); if (host_err < 0) { - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); goto out_nfserr; } if (created) @@ -1503,7 +1514,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = nfserrno(commit_metadata(fhp)); - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); /* * Update the filehandle to get the new inode info. */ @@ -1600,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; @@ -1621,7 +1632,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserrno(commit_metadata(fhp)); fh_unlock(fhp); - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); dput(dnew); @@ -1674,7 +1685,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; - host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + host_err = fh_want_write(tfhp); if (host_err) { err = nfserrno(host_err); goto out_dput; @@ -1699,7 +1710,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserrno(host_err); } out_drop_write: - mnt_drop_write(tfhp->fh_export->ex_path.mnt); + fh_drop_write(tfhp); out_dput: dput(dnew); out_unlock: @@ -1776,7 +1787,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; - host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + host_err = fh_want_write(ffhp); if (host_err) goto out_dput_new; @@ -1795,7 +1806,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = commit_metadata(ffhp); } out_drop_write: - mnt_drop_write(ffhp->fh_export->ex_path.mnt); + fh_drop_write(ffhp); out_dput_new: dput(ndentry); out_dput_old: @@ -1854,7 +1865,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + host_err = fh_want_write(fhp); if (host_err) goto out_put; @@ -1868,7 +1879,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!host_err) host_err = commit_metadata(fhp); out_drop_write: - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); out_put: dput(rdentry); @@ -2270,7 +2281,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; - error = mnt_want_write(fhp->fh_export->ex_path.mnt); + error = fh_want_write(fhp); if (error) goto getout; if (size) @@ -2284,7 +2295,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) error = 0; } } - mnt_drop_write(fhp->fh_export->ex_path.mnt); + fh_drop_write(fhp); getout: kfree(value); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3f54ad03bb2..1dcd238e11a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -66,7 +66,7 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ -__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); void nfsd_close(struct file *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, @@ -106,4 +106,14 @@ struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); #endif +static inline int fh_want_write(struct svc_fh *fh) +{ + return mnt_want_write(fh->fh_export->ex_path.mnt); +} + +static inline void fh_drop_write(struct svc_fh *fh) +{ + mnt_drop_write(fh->fh_export->ex_path.mnt); +} + #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 3a1923943b1..ca35b3a46d1 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -251,7 +251,7 @@ nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b50ffb72e5b..8f7b95ac1f7 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -291,7 +291,7 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; -struct inode *nilfs_new_inode(struct inode *dir, int mode) +struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct the_nilfs *nilfs = sb->s_fs_info; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index ac258beeda3..2a70fce70c6 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -27,7 +27,7 @@ #include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ #include <linux/vmalloc.h> #include <linux/compat.h> /* compat_ptr() */ -#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ +#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "nilfs.h" @@ -119,7 +119,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, if (get_user(flags, (int __user *)argp)) return -EFAULT; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -154,7 +154,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, ret = nilfs_transaction_commit(inode->i_sb); out: mutex_unlock(&inode->i_mutex); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -174,7 +174,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -194,7 +194,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, up_read(&inode->i_sb->s_umount); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -210,7 +210,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -225,7 +225,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, else nilfs_transaction_commit(inode->i_sb); /* never fails */ out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -591,7 +591,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, nsegs = argv[4].v_nmembs; if (argv[4].v_size != argsz[4]) goto out; + if (nsegs > UINT_MAX / sizeof(__u64)) + goto out; /* * argv[4] points to segment numbers this ioctl cleans. We @@ -675,7 +677,7 @@ out_free: vfree(kbufs[n]); kfree(kbufs[4]); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } @@ -710,7 +712,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, if (!capable(CAP_SYS_ADMIN)) goto out; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) goto out; @@ -721,7 +723,7 @@ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, ret = nilfs_resize_fs(inode->i_sb, newsize); out_drop_write: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); out: return ret; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 768982de10e..1cd3f624dff 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -84,7 +84,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -112,7 +112,7 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, } static int -nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct nilfs_transaction_info ti; @@ -213,7 +213,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 3777d138f89..250add84da7 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -246,7 +246,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, /* inode.c */ void nilfs_inode_add_blocks(struct inode *inode, int n); void nilfs_inode_sub_blocks(struct inode *inode, int n); -extern struct inode *nilfs_new_inode(struct inode *, int); +extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern void nilfs_free_inode(struct inode *); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index bb24ab6c282..0e72ad6f22a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2470,7 +2470,7 @@ static int nilfs_segctor_thread(void *arg) if (freezing(current)) { spin_unlock(&sci->sc_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&sci->sc_state_lock); } else { DEFINE_WAIT(wait); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8351c44a732..08e3d4f9df1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -175,8 +175,6 @@ static void nilfs_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct nilfs_mdt_info *mdi = NILFS_MDT(inode); - INIT_LIST_HEAD(&inode->i_dentry); - if (mdi) { kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); @@ -650,11 +648,11 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { - struct super_block *sb = vfs->mnt_sb; + struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d3271409437..501b7f8b739 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -409,6 +409,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); + if (nilfs->ns_r_segments_percentage < 1 || + nilfs->ns_r_segments_percentage > 99) { + printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); + return -EINVAL; + } + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; @@ -515,6 +521,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, brelse(sbh[1]); sbh[1] = NULL; sbp[1] = NULL; + valid[1] = 0; swp = 0; } if (!valid[swp]) { diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 44a88a9fa2c..fea6bd5831d 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] = #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff -int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) +int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu) { unsigned long l; int c0, c, nc; @@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) *pu = (unicode_t) l; return nc; } - if (len <= nc) + if (inlen <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) } EXPORT_SYMBOL(utf8_to_utf32); -int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) +int utf32_to_utf8(unicode_t u, u8 *s, int maxout) { unsigned long l; int c, nc; @@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) return -1; nc = 0; - for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { + for (t = utf8_table; t->cmask && maxout; t++, maxout--) { nc++; if (l <= t->lmask) { c = t->shift; @@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) } EXPORT_SYMBOL(utf32_to_utf8); -int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) +static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + *s = (wchar_t) c; + break; + case UTF16_LITTLE_ENDIAN: + *s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *s = __cpu_to_be16(c); + break; + } +} + +int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; - while (*s && len > 0) { + while (inlen > 0 && maxout > 0 && *s) { if (*s & 0x80) { - size = utf8_to_utf32(s, len, &u); + size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; + s += size; + inlen -= size; if (u >= PLANE_SIZE) { + if (maxout < 2) + break; u -= PLANE_SIZE; - *op++ = (wchar_t) (SURROGATE_PAIR | - ((u >> 10) & SURROGATE_BITS)); - *op++ = (wchar_t) (SURROGATE_PAIR | + put_utf16(op++, SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | - (u & SURROGATE_BITS)); + (u & SURROGATE_BITS), + endian); + maxout -= 2; } else { - *op++ = (wchar_t) u; + put_utf16(op++, u, endian); + maxout--; } - s += size; - len -= size; } else { - *op++ = *s++; - len--; + put_utf16(op++, *s++, endian); + inlen--; + maxout--; } } return op - pwcs; @@ -160,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) } } -int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, - u8 *s, int maxlen) +int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, + u8 *s, int maxout) { u8 *op; int size; unsigned long u, v; op = s; - while (len > 0 && maxlen > 0) { + while (inlen > 0 && maxout > 0) { u = get_utf16(*pwcs, endian); if (!u) break; pwcs++; - len--; + inlen--; if (u > 0x7f) { if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { if (u & SURROGATE_LOW) { /* Ignore character and move on */ continue; } - if (len <= 0) + if (inlen <= 0) break; v = get_utf16(*pwcs, endian); if ((v & SURROGATE_MASK) != SURROGATE_PAIR || @@ -191,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + (v & SURROGATE_BITS); pwcs++; - len--; + inlen--; } - size = utf32_to_utf8(u, op, maxlen); + size = utf32_to_utf8(u, op, maxout); if (size == -1) { /* Ignore character and move on */ } else { op += size; - maxlen -= size; + maxout -= size; } } else { *op++ = (u8) u; - maxlen--; + maxout--; } } return op - s; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9fde1c00a29..3568c8a8b13 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,8 @@ #include <asm/ioctls.h> +#include "../../mount.h" + #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 @@ -546,7 +548,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); fsnotify_put_mark(fsn_mark); - if (removed & mnt->mnt_fsnotify_mask) + if (removed & real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); return 0; @@ -623,7 +625,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - if (added & ~mnt->mnt_fsnotify_mask) + if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); err: fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 79b47cbb5cd..ccb14d3fc0d 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,6 +26,7 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core @@ -205,13 +206,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_event *event = NULL; - struct vfsmount *mnt; + struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = ((struct path *)data)->mnt; + mnt = real_mount(((struct path *)data)->mnt); else mnt = NULL; @@ -262,11 +263,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); inode_group = NULL; } else { - ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index e14587d5568..f104d565b68 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; - /* 1 from caller and 1 for being on i_list/g_list */ - BUG_ON(atomic_read(&mark->refcnt) < 2); - spin_lock(&group->mark_lock); if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { @@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark) iput(inode); /* + * We don't necessarily have a ref on mark from caller so the above iput + * may have already destroyed it. Don't touch from now on. + */ + + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the * case, we finish freeing the group here. diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 778fe6cae3b..b7b4b0e8554 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,15 +28,17 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../mount.h" void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { struct fsnotify_mark *mark, *lmark; struct hlist_node *pos, *n; + struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) { list_add(&mark->m.free_m_list, &free_list); hlist_del_init_rcu(&mark->m.m_list); fsnotify_get_mark(mark); @@ -59,15 +61,16 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) */ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; struct hlist_node *pos; __u32 new_mask = 0; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) + hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) new_mask |= mark->mask; - mnt->mnt_fsnotify_mask = new_mask; + m->mnt_fsnotify_mask = new_mask; } /* @@ -101,12 +104,13 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; struct hlist_node *pos; assert_spin_locked(&mnt->mnt_root->d_lock); - hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; @@ -140,6 +144,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct vfsmount *mnt, int allow_dups) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *lmark; struct hlist_node *node, *last = NULL; int ret = 0; @@ -154,13 +159,13 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, mark->m.mnt = mnt; /* is mark the first mark? */ - if (hlist_empty(&mnt->mnt_fsnotify_marks)) { - hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks); + if (hlist_empty(&m->mnt_fsnotify_marks)) { + hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); goto out; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) { + hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) { last = node; if ((lmark->group == group) && !allow_dups) { diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index f14fde2b03d..e0281992ddc 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,7 +1,7 @@ /** * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -345,10 +345,10 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, unsigned long flags; bool is_retry = false; + BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", ni->mft_no, (unsigned long long)vcn, write_locked ? "write" : "read"); - BUG_ON(!ni); BUG_ON(!NInoNonResident(ni)); BUG_ON(vcn < 0); if (!ni->runlist.rl) { @@ -469,9 +469,9 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, int err = 0; bool is_retry = false; + BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!ni); BUG_ON(!NInoNonResident(ni)); BUG_ON(vcn < 0); if (!ni->runlist.rl) { diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 97e2dacbc86..2eaa6665294 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -335,7 +335,6 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) static void ntfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); } @@ -2301,16 +2300,16 @@ void ntfs_evict_big_inode(struct inode *vi) /** * ntfs_show_options - show mount options in /proc/mounts * @sf: seq_file in which to write our mount options - * @mnt: vfs mount whose mount options to display + * @root: root of the mounted tree whose mount options to display * * Called by the VFS once for each mounted ntfs volume when someone reads * /proc/mounts in order to display the NTFS specific mount options of each - * mount. The mount options of the vfs mount @mnt are written to the seq file + * mount. The mount options of fs specified by @root are written to the seq file * @sf and success is returned. */ -int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt) +int ntfs_show_options(struct seq_file *sf, struct dentry *root) { - ntfs_volume *vol = NTFS_SB(mnt->mnt_sb); + ntfs_volume *vol = NTFS_SB(root->d_sb); int i; seq_printf(sf, ",uid=%i", vol->uid); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index fe8e7e92888..db29695f845 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -298,7 +298,7 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni); extern int ntfs_read_inode_mount(struct inode *vi); -extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt); +extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); #ifdef NTFS_RW diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 382857f9c7d..3014a36a255 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft " "bitmap."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate " + ntfs_error(vol->sb, "Failed to deallocate " "allocated cluster.%s", es); NVolSetErrors(vol); } @@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft data " "attribute."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate clusters " + ntfs_error(vol->sb, "Failed to deallocate clusters " "from the mft data attribute.%s", es); NVolSetErrors(vol); } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index b52706da464..f907611cca7 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -104,7 +104,7 @@ static bool parse_options(ntfs_volume *vol, char *opt) int errors = 0, sloppy = 0; uid_t uid = (uid_t)-1; gid_t gid = (gid_t)-1; - mode_t fmask = (mode_t)-1, dmask = (mode_t)-1; + umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; int mft_zone_multiplier = -1, on_errors = -1; int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; struct nls_table *nls_map = NULL, *old_nls; @@ -287,9 +287,9 @@ no_mount_options: vol->uid = uid; if (gid != (gid_t)-1) vol->gid = gid; - if (fmask != (mode_t)-1) + if (fmask != (umode_t)-1) vol->fmask = fmask; - if (dmask != (mode_t)-1) + if (dmask != (umode_t)-1) vol->dmask = dmask; if (show_sys_files != -1) { if (show_sys_files) @@ -1239,7 +1239,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol) { MFT_REF mref; struct inode *vi; - ntfs_inode *ni; struct page *page; u32 *kaddr, *kend; ntfs_name *name = NULL; @@ -1290,7 +1289,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol) "is not the system volume.", i_size_read(vi)); goto iput_out; } - ni = NTFS_I(vi); page = ntfs_map_page(vi->i_mapping, 0); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); @@ -3198,7 +3196,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG -module_param(debug_msgs, bool, 0); +module_param(debug_msgs, bint, 0); MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); #endif diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 406ab55dfb3..15e3ba8d521 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -48,8 +48,8 @@ typedef struct { unsigned long flags; /* Miscellaneous flags, see below. */ uid_t uid; /* uid that files will be mounted as. */ gid_t gid; /* gid that files will be mounted as. */ - mode_t fmask; /* The mask for file permissions. */ - mode_t dmask; /* The mask for directory + umode_t fmask; /* The mask for file permissions. */ + umode_t dmask; /* The mask for directory permissions. */ u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ u8 on_errors; /* What to do on filesystem errors. */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index dc45deb19e6..73ba81928bc 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -553,7 +553,7 @@ void o2net_debugfs_exit(void) int o2net_debugfs_init(void) { - mode_t mode = S_IFREG|S_IRUSR; + umode_t mode = S_IFREG|S_IRUSR; o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); if (o2net_dentry) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b4207679704..abfac0d7ae9 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -354,7 +354,6 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) static void dlmfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } @@ -401,16 +400,14 @@ static struct backing_dev_info dlmfs_backing_dev_info = { static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); - int mode = S_IFDIR | 0755; + umode_t mode = S_IFDIR | 0755; struct dlmfs_inode_private *ip; if (inode) { ip = DLMFS_I(inode); inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -424,7 +421,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) static struct inode *dlmfs_get_inode(struct inode *parent, struct dentry *dentry, - int mode) + umode_t mode) { struct super_block *sb = parent->i_sb; struct inode * inode = new_inode(sb); @@ -434,9 +431,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, parent, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -473,13 +468,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inc_nlink(inode); break; } - - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - return inode; } @@ -489,7 +477,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, /* SMP-safe */ static int dlmfs_mkdir(struct inode * dir, struct dentry * dentry, - int mode) + umode_t mode) { int status; struct inode *inode = NULL; @@ -537,7 +525,7 @@ bail: static int dlmfs_create(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { int status = 0; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6e396683c3d..061591a3ab0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2128,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * remove_suid() calls ->setattr without any hint that * we may have already done our cluster locking. Since * ocfs2_setattr() *must* take cluster locks to - * proceeed, this will lead us to recursively lock the + * proceed, this will lead us to recursively lock the * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 726ff265b29..a6fda3c188a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -906,12 +906,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - status = mnt_want_write(filp->f_path.mnt); + status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_set_inode_attr(inode, flags, OCFS2_FL_MODIFIABLE); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 184c76b8c29..b1e3fce72ea 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1059,7 +1059,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) struct ocfs2_move_extents range; struct ocfs2_move_extents_context *context = NULL; - status = mnt_want_write(filp->f_path.mnt); + status = mnt_want_write_file(filp); if (status) return status; @@ -1145,7 +1145,7 @@ out: kfree(context); - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8b2bfea574..a9856e3eaaf 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -185,7 +185,7 @@ bail: return ret; } -static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) { struct inode *inode; @@ -207,7 +207,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t dev) { int status = 0; @@ -602,7 +602,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { int ret; @@ -617,7 +617,7 @@ static int ocfs2_mkdir(struct inode *dir, static int ocfs2_create(struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { int ret; @@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir, handle_t *handle = NULL; struct buffer_head *old_dir_bh = NULL; struct buffer_head *new_dir_bh = NULL; - nlink_t old_dir_nlink = old_dir->i_nlink; + u32 old_dir_nlink = old_dir->i_nlink; struct ocfs2_dinode *old_di; struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a5ebe421195..286edf1e231 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) goto out; } - rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), - &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); + rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, + NULL, NULL, NULL, &fsdlm); if (rc) { ocfs2_live_connection_drop(control); goto out; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4994f8b0e60..604e12c4e97 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -108,7 +108,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *options, int is_remount); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); -static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); +static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); static int ocfs2_remount(struct super_block *sb, int *flags, char *data); @@ -569,7 +569,6 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) static void ocfs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } @@ -1534,9 +1533,9 @@ bail: return status; } -static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +static int ocfs2_show_options(struct seq_file *s, struct dentry *root) { - struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); + struct ocfs2_super *osb = OCFS2_SB(root->d_sb); unsigned long opts = osb->s_mount_opt; unsigned int local_alloc_megs; @@ -1568,8 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); - if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME)) - seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) seq_printf(s, ",commit=%u", diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index aa9e8777b09..0ba9ea1e796 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -623,7 +623,7 @@ int ocfs2_calc_security_init(struct inode *dir, int ocfs2_calc_xattr_init(struct inode *dir, struct buffer_head *dir_bh, - int mode, + umode_t mode, struct ocfs2_security_xattr_info *si, int *want_clusters, int *xattr_credits, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index d63cfb72316..e5c7f15465b 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *, struct ocfs2_security_xattr_info *, int *, int *, struct ocfs2_alloc_context **); int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, - int, struct ocfs2_security_xattr_info *, + umode_t, struct ocfs2_security_xattr_info *, int *, int *, int *); /* diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 98e54427439..f00576ec320 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -255,7 +255,7 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry) return 0; } -static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) +static int omfs_add_node(struct inode *dir, struct dentry *dentry, umode_t mode) { int err; struct inode *inode = omfs_new_inode(dir, mode); @@ -279,12 +279,12 @@ out_free_inode: return err; } -static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return omfs_add_node(dir, dentry, mode | S_IFREG); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index e043c4cb9a9..6065bb0ba20 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -28,7 +28,7 @@ struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) return sb_bread(sb, clus_to_blk(sbi, block)); } -struct inode *omfs_new_inode(struct inode *dir, int mode) +struct inode *omfs_new_inode(struct inode *dir, umode_t mode) { struct inode *inode; u64 new_block; diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h index 7d414fef501..8941f12c6b0 100644 --- a/fs/omfs/omfs.h +++ b/fs/omfs/omfs.h @@ -60,7 +60,7 @@ extern int omfs_shrink_inode(struct inode *inode); /* inode.c */ extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block); extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); -extern struct inode *omfs_new_inode(struct inode *dir, int mode); +extern struct inode *omfs_new_inode(struct inode *dir, umode_t mode); extern int omfs_reserve_block(struct super_block *sb, sector_t block); extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); extern int omfs_sync_inode(struct inode *inode); diff --git a/fs/open.c b/fs/open.c index 22c41b543f2..77becc04114 100644 --- a/fs/open.c +++ b/fs/open.c @@ -456,7 +456,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; mutex_lock(&inode->i_mutex); - error = security_path_chmod(path->dentry, path->mnt, mode); + error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); @@ -468,7 +468,7 @@ out_unlock: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { struct file * file; int err = -EBADF; @@ -482,7 +482,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { struct path path; int error; @@ -495,7 +495,7 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) return error; } -SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return sys_fchmodat(AT_FDCWD, filename, mode); } @@ -608,7 +608,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) dentry = file->f_path.dentry; audit_inode(NULL, dentry); error = chown_common(&file->f_path, user, group); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); out_fput: fput(file); out: @@ -877,7 +877,7 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); -static inline int build_open_flags(int flags, int mode, struct open_flags *op) +static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; int acc_mode; @@ -948,7 +948,7 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op) * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *filp_open(const char *filename, int flags, umode_t mode) { struct open_flags op; int lookup = build_open_flags(flags, mode, &op); @@ -970,7 +970,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, int mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; int lookup = build_open_flags(flags, mode, &op); @@ -994,7 +994,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { long ret; @@ -1008,7 +1008,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, - int, mode) + umode_t, mode) { long ret; @@ -1027,7 +1027,7 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ -SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index e4e0ff7962e..a88c03bc749 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -346,7 +346,6 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) static void openprom_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(op_inode_cachep, OP_I(inode)); } diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig deleted file mode 100644 index cb5f0a3f1b0..00000000000 --- a/fs/partitions/Kconfig +++ /dev/null @@ -1,251 +0,0 @@ -# -# Partition configuration -# -config PARTITION_ADVANCED - bool "Advanced partition selection" - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under an operating system running on a different - architecture than your Linux system. - - Note that the answer to this question won't directly affect the - kernel: saying N will just cause the configurator to skip all - the questions about foreign partitioning schemes. - - If unsure, say N. - -config ACORN_PARTITION - bool "Acorn partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - help - Support hard disks partitioned under Acorn operating systems. - -config ACORN_PARTITION_CUMANA - bool "Cumana partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using the Cumana interface on Acorn machines. - -config ACORN_PARTITION_EESOX - bool "EESOX partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - -config ACORN_PARTITION_ICS - bool "ICS partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using the ICS interface on Acorn machines. - -config ACORN_PARTITION_ADFS - bool "Native filecore partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - The Acorn Disc Filing System is the standard file system of the - RiscOS operating system which runs on Acorn's ARM-based Risc PC - systems and the Acorn Archimedes range of machines. If you say - `Y' here, Linux will support disk partitions created under ADFS. - -config ACORN_PARTITION_POWERTEC - bool "PowerTec partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Support reading partition tables created on Acorn machines using - the PowerTec SCSI drive. - -config ACORN_PARTITION_RISCIX - bool "RISCiX partition support" if PARTITION_ADVANCED - default y if ARCH_ACORN - depends on ACORN_PARTITION - help - Once upon a time, there was a native Unix port for the Acorn series - of machines called RISCiX. If you say 'Y' here, Linux will be able - to read disks partitioned under RISCiX. - -config OSF_PARTITION - bool "Alpha OSF partition support" if PARTITION_ADVANCED - default y if ALPHA - help - Say Y here if you would like to use hard disks under Linux which - were partitioned on an Alpha machine. - -config AMIGA_PARTITION - bool "Amiga partition table support" if PARTITION_ADVANCED - default y if (AMIGA || AFFS_FS=y) - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under AmigaOS. - -config ATARI_PARTITION - bool "Atari partition table support" if PARTITION_ADVANCED - default y if ATARI - help - Say Y here if you would like to use hard disks under Linux which - were partitioned under the Atari OS. - -config IBM_PARTITION - bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && S390 - help - Say Y here if you would like to be able to read the hard disk - partition table format used by IBM DASD disks operating under CMS. - Otherwise, say N. - -config MAC_PARTITION - bool "Macintosh partition map support" if PARTITION_ADVANCED - default y if (MAC || PPC_PMAC) - help - Say Y here if you would like to use hard disks under Linux which - were partitioned on a Macintosh. - -config MSDOS_PARTITION - bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED - default y - help - Say Y here. - -config BSD_DISKLABEL - bool "BSD disklabel (FreeBSD partition tables) support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - FreeBSD uses its own hard disk partition scheme on your PC. It - requires only one entry in the primary partition table of your disk - and manages it similarly to DOS extended partitions, putting in its - first sector a new partition table in BSD disklabel format. Saying Y - here allows you to read these disklabels and further mount FreeBSD - partitions from within Linux if you have also said Y to "UFS - file system support", above. If you don't know what all this is - about, say N. - -config MINIX_SUBPARTITION - bool "Minix subpartition support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - Minix 2.0.0/2.0.2 subpartition table support for Linux. - Say Y here if you want to mount and use Minix 2.0.0/2.0.2 - subpartitions. - -config SOLARIS_X86_PARTITION - bool "Solaris (x86) partition table support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - help - Like most systems, Solaris x86 uses its own hard disk partition - table format, incompatible with all others. Saying Y here allows you - to read these partition tables and further mount Solaris x86 - partitions from within Linux if you have also said Y to "UFS - file system support", above. - -config UNIXWARE_DISKLABEL - bool "Unixware slices support" - depends on PARTITION_ADVANCED && MSDOS_PARTITION - ---help--- - Like some systems, UnixWare uses its own slice table inside a - partition (VTOC - Virtual Table of Contents). Its format is - incompatible with all other OSes. Saying Y here allows you to read - VTOC and further mount UnixWare partitions read-only from within - Linux if you have also said Y to "UFS file system support" or - "System V and Coherent file system support", above. - - This is mainly used to carry data from a UnixWare box to your - Linux box via a removable medium like magneto-optical, ZIP or - removable IDE drives. Note, however, that a good portable way to - transport files and directories between unixes (and even other - operating systems) is given by the tar program ("man tar" or - preferably "info tar"). - - If you don't know what all this is about, say N. - -config LDM_PARTITION - bool "Windows Logical Disk Manager (Dynamic Disk) support" - depends on PARTITION_ADVANCED - ---help--- - Say Y here if you would like to use hard disks under Linux which - were partitioned using Windows 2000's/XP's or Vista's Logical Disk - Manager. They are also known as "Dynamic Disks". - - Note this driver only supports Dynamic Disks with a protective MBR - label, i.e. DOS partition table. It does not support GPT labelled - Dynamic Disks yet as can be created with Vista. - - Windows 2000 introduced the concept of Dynamic Disks to get around - the limitations of the PC's partitioning scheme. The Logical Disk - Manager allows the user to repartition a disk and create spanned, - mirrored, striped or RAID volumes, all without the need for - rebooting. - - Normal partitions are now called Basic Disks under Windows 2000, XP, - and Vista. - - For a fuller description read <file:Documentation/ldm.txt>. - - If unsure, say N. - -config LDM_DEBUG - bool "Windows LDM extra logging" - depends on LDM_PARTITION - help - Say Y here if you would like LDM to log verbosely. This could be - helpful if the driver doesn't work as expected and you'd like to - report a bug. - - If unsure, say N. - -config SGI_PARTITION - bool "SGI partition support" if PARTITION_ADVANCED - default y if DEFAULT_SGI_PARTITION - help - Say Y here if you would like to be able to read the hard disk - partition table format used by SGI machines. - -config ULTRIX_PARTITION - bool "Ultrix partition table support" if PARTITION_ADVANCED - default y if MACH_DECSTATION - help - Say Y here if you would like to be able to read the hard disk - partition table format used by DEC (now Compaq) Ultrix machines. - Otherwise, say N. - -config SUN_PARTITION - bool "Sun partition tables support" if PARTITION_ADVANCED - default y if (SPARC || SUN3 || SUN3X) - ---help--- - Like most systems, SunOS uses its own hard disk partition table - format, incompatible with all others. Saying Y here allows you to - read these partition tables and further mount SunOS partitions from - within Linux if you have also said Y to "UFS file system support", - above. This is mainly used to carry data from a SPARC under SunOS to - your Linux box via a removable medium like magneto-optical or ZIP - drives; note however that a good portable way to transport files and - directories between unixes (and even other operating systems) is - given by the tar program ("man tar" or preferably "info tar"). If - you don't know what all this is about, say N. - -config KARMA_PARTITION - bool "Karma Partition support" - depends on PARTITION_ADVANCED - help - Say Y here if you would like to mount the Rio Karma MP3 player, as it - uses a proprietary partition table. - -config EFI_PARTITION - bool "EFI GUID Partition support" - depends on PARTITION_ADVANCED - select CRC32 - help - Say Y here if you would like to use hard disks under Linux which - were partitioned using EFI GPT. - -config SYSV68_PARTITION - bool "SYSV68 partition table support" if PARTITION_ADVANCED - default y if VME - help - Say Y here if you would like to be able to read the hard disk - partition table format used by Motorola Delta machines (using - sysv68). - Otherwise, say N. diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile deleted file mode 100644 index 03af8eac51d..00000000000 --- a/fs/partitions/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-$(CONFIG_BLOCK) := check.o - -obj-$(CONFIG_ACORN_PARTITION) += acorn.o -obj-$(CONFIG_AMIGA_PARTITION) += amiga.o -obj-$(CONFIG_ATARI_PARTITION) += atari.o -obj-$(CONFIG_MAC_PARTITION) += mac.o -obj-$(CONFIG_LDM_PARTITION) += ldm.o -obj-$(CONFIG_MSDOS_PARTITION) += msdos.o -obj-$(CONFIG_OSF_PARTITION) += osf.o -obj-$(CONFIG_SGI_PARTITION) += sgi.o -obj-$(CONFIG_SUN_PARTITION) += sun.o -obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o -obj-$(CONFIG_IBM_PARTITION) += ibm.o -obj-$(CONFIG_EFI_PARTITION) += efi.o -obj-$(CONFIG_KARMA_PARTITION) += karma.o -obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c deleted file mode 100644 index fbeb697374d..00000000000 --- a/fs/partitions/acorn.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - * linux/fs/partitions/acorn.c - * - * Copyright (c) 1996-2000 Russell King. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Scan ADFS partitions on hard disk drives. Unfortunately, there - * isn't a standard for partitioning drives on Acorn machines, so - * every single manufacturer of SCSI and IDE cards created their own - * method. - */ -#include <linux/buffer_head.h> -#include <linux/adfs_fs.h> - -#include "check.h" -#include "acorn.h" - -/* - * Partition types. (Oh for reusability) - */ -#define PARTITION_RISCIX_MFM 1 -#define PARTITION_RISCIX_SCSI 2 -#define PARTITION_LINUX 9 - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static struct adfs_discrecord * -adfs_partition(struct parsed_partitions *state, char *name, char *data, - unsigned long first_sector, int slot) -{ - struct adfs_discrecord *dr; - unsigned int nr_sects; - - if (adfs_checkbblk(data)) - return NULL; - - dr = (struct adfs_discrecord *)(data + 0x1c0); - - if (dr->disc_size == 0 && dr->disc_size_high == 0) - return NULL; - - nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | - (le32_to_cpu(dr->disc_size) >> 9); - - if (name) { - strlcat(state->pp_buf, " [", PAGE_SIZE); - strlcat(state->pp_buf, name, PAGE_SIZE); - strlcat(state->pp_buf, "]", PAGE_SIZE); - } - put_partition(state, slot, first_sector, nr_sects); - return dr; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_RISCIX - -struct riscix_part { - __le32 start; - __le32 length; - __le32 one; - char name[16]; -}; - -struct riscix_record { - __le32 magic; -#define RISCIX_MAGIC cpu_to_le32(0x4a657320) - __le32 date; - struct riscix_part part[8]; -}; - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static int riscix_partition(struct parsed_partitions *state, - unsigned long first_sect, int slot, - unsigned long nr_sects) -{ - Sector sect; - struct riscix_record *rr; - - rr = read_part_sector(state, first_sect, §); - if (!rr) - return -1; - - strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); - - - if (rr->magic == RISCIX_MAGIC) { - unsigned long size = nr_sects > 2 ? 2 : nr_sects; - int part; - - strlcat(state->pp_buf, " <", PAGE_SIZE); - - put_partition(state, slot++, first_sect, size); - for (part = 0; part < 8; part++) { - if (rr->part[part].one && - memcmp(rr->part[part].name, "All\0", 4)) { - put_partition(state, slot++, - le32_to_cpu(rr->part[part].start), - le32_to_cpu(rr->part[part].length)); - strlcat(state->pp_buf, "(", PAGE_SIZE); - strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); - strlcat(state->pp_buf, ")", PAGE_SIZE); - } - } - - strlcat(state->pp_buf, " >\n", PAGE_SIZE); - } else { - put_partition(state, slot++, first_sect, nr_sects); - } - - put_dev_sector(sect); - return slot; -} -#endif -#endif - -#define LINUX_NATIVE_MAGIC 0xdeafa1de -#define LINUX_SWAP_MAGIC 0xdeafab1e - -struct linux_part { - __le32 magic; - __le32 start_sect; - __le32 nr_sects; -}; - -#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ - defined(CONFIG_ACORN_PARTITION_ADFS) -static int linux_partition(struct parsed_partitions *state, - unsigned long first_sect, int slot, - unsigned long nr_sects) -{ - Sector sect; - struct linux_part *linuxp; - unsigned long size = nr_sects > 2 ? 2 : nr_sects; - - strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); - - put_partition(state, slot++, first_sect, size); - - linuxp = read_part_sector(state, first_sect, §); - if (!linuxp) - return -1; - - strlcat(state->pp_buf, " <", PAGE_SIZE); - while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || - linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { - if (slot == state->limit) - break; - put_partition(state, slot++, first_sect + - le32_to_cpu(linuxp->start_sect), - le32_to_cpu(linuxp->nr_sects)); - linuxp ++; - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - - put_dev_sector(sect); - return slot; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_CUMANA -int adfspart_check_CUMANA(struct parsed_partitions *state) -{ - unsigned long first_sector = 0; - unsigned int start_blk = 0; - Sector sect; - unsigned char *data; - char *name = "CUMANA/ADFS"; - int first = 1; - int slot = 1; - - /* - * Try Cumana style partitions - sector 6 contains ADFS boot block - * with pointer to next 'drive'. - * - * There are unknowns in this code - is the 'cylinder number' of the - * next partition relative to the start of this one - I'm assuming - * it is. - * - * Also, which ID did Cumana use? - * - * This is totally unfinished, and will require more work to get it - * going. Hence it is totally untested. - */ - do { - struct adfs_discrecord *dr; - unsigned int nr_sects; - - data = read_part_sector(state, start_blk * 2 + 6, §); - if (!data) - return -1; - - if (slot == state->limit) - break; - - dr = adfs_partition(state, name, data, first_sector, slot++); - if (!dr) - break; - - name = NULL; - - nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * - (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * - dr->secspertrack; - - if (!nr_sects) - break; - - first = 0; - first_sector += nr_sects; - start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); - nr_sects = 0; /* hmm - should be partition size */ - - switch (data[0x1fc] & 15) { - case 0: /* No partition / ADFS? */ - break; - -#ifdef CONFIG_ACORN_PARTITION_RISCIX - case PARTITION_RISCIX_SCSI: - /* RISCiX - we don't know how to find the next one. */ - slot = riscix_partition(state, first_sector, slot, - nr_sects); - break; -#endif - - case PARTITION_LINUX: - slot = linux_partition(state, first_sector, slot, - nr_sects); - break; - } - put_dev_sector(sect); - if (slot == -1) - return -1; - } while (1); - put_dev_sector(sect); - return first ? 0 : 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_ADFS -/* - * Purpose: allocate ADFS partitions. - * - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * - * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. - * - * Alloc : hda = whole drive - * hda1 = ADFS partition on first drive. - * hda2 = non-ADFS partition. - */ -int adfspart_check_ADFS(struct parsed_partitions *state) -{ - unsigned long start_sect, nr_sects, sectscyl, heads; - Sector sect; - unsigned char *data; - struct adfs_discrecord *dr; - unsigned char id; - int slot = 1; - - data = read_part_sector(state, 6, §); - if (!data) - return -1; - - dr = adfs_partition(state, "ADFS", data, 0, slot++); - if (!dr) { - put_dev_sector(sect); - return 0; - } - - heads = dr->heads + ((dr->lowsector >> 6) & 1); - sectscyl = dr->secspertrack * heads; - start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; - id = data[0x1fc] & 15; - put_dev_sector(sect); - - /* - * Work out start of non-adfs partition. - */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; - - if (start_sect) { - switch (id) { -#ifdef CONFIG_ACORN_PARTITION_RISCIX - case PARTITION_RISCIX_SCSI: - case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, - nr_sects); - break; -#endif - - case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, - nr_sects); - break; - } - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_ICS - -struct ics_part { - __le32 start; - __le32 size; -}; - -static int adfspart_check_ICSLinux(struct parsed_partitions *state, - unsigned long block) -{ - Sector sect; - unsigned char *data = read_part_sector(state, block, §); - int result = 0; - - if (data) { - if (memcmp(data, "LinuxPart", 9) == 0) - result = 1; - put_dev_sector(sect); - } - - return result; -} - -/* - * Check for a valid ICS partition using the checksum. - */ -static inline int valid_ics_sector(const unsigned char *data) -{ - unsigned long sum; - int i; - - for (i = 0, sum = 0x50617274; i < 508; i++) - sum += data[i]; - - sum -= le32_to_cpu(*(__le32 *)(&data[508])); - - return sum == 0; -} - -/* - * Purpose: allocate ICS partitions. - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. - * Alloc : hda = whole drive - * hda1 = ADFS partition 0 on first drive. - * hda2 = ADFS partition 1 on first drive. - * ..etc.. - */ -int adfspart_check_ICS(struct parsed_partitions *state) -{ - const unsigned char *data; - const struct ics_part *p; - int slot; - Sector sect; - - /* - * Try ICS style partitions - sector 0 contains partition info. - */ - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - if (!valid_ics_sector(data)) { - put_dev_sector(sect); - return 0; - } - - strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); - - for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { - u32 start = le32_to_cpu(p->start); - s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ - - if (slot == state->limit) - break; - - /* - * Negative sizes tell the RISC OS ICS driver to ignore - * this partition - in effect it says that this does not - * contain an ADFS filesystem. - */ - if (size < 0) { - size = -size; - - /* - * Our own extension - We use the first sector - * of the partition to identify what type this - * partition is. We must not make this visible - * to the filesystem. - */ - if (size > 1 && adfspart_check_ICSLinux(state, start)) { - start += 1; - size -= 1; - } - } - - if (size) - put_partition(state, slot++, start, size); - } - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_POWERTEC -struct ptec_part { - __le32 unused1; - __le32 unused2; - __le32 start; - __le32 size; - __le32 unused5; - char type[8]; -}; - -static inline int valid_ptec_sector(const unsigned char *data) -{ - unsigned char checksum = 0x2a; - int i; - - /* - * If it looks like a PC/BIOS partition, then it - * probably isn't PowerTec. - */ - if (data[510] == 0x55 && data[511] == 0xaa) - return 0; - - for (i = 0; i < 511; i++) - checksum += data[i]; - - return checksum == data[511]; -} - -/* - * Purpose: allocate ICS partitions. - * Params : hd - pointer to gendisk structure to store partition info. - * dev - device number to access. - * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. - * Alloc : hda = whole drive - * hda1 = ADFS partition 0 on first drive. - * hda2 = ADFS partition 1 on first drive. - * ..etc.. - */ -int adfspart_check_POWERTEC(struct parsed_partitions *state) -{ - Sector sect; - const unsigned char *data; - const struct ptec_part *p; - int slot = 1; - int i; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - if (!valid_ptec_sector(data)) { - put_dev_sector(sect); - return 0; - } - - strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); - - for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { - u32 start = le32_to_cpu(p->start); - u32 size = le32_to_cpu(p->size); - - if (size) - put_partition(state, slot++, start, size); - } - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} -#endif - -#ifdef CONFIG_ACORN_PARTITION_EESOX -struct eesox_part { - char magic[6]; - char name[10]; - __le32 start; - __le32 unused6; - __le32 unused7; - __le32 unused8; -}; - -/* - * Guess who created this format? - */ -static const char eesox_name[] = { - 'N', 'e', 'i', 'l', ' ', - 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' -}; - -/* - * EESOX SCSI partition format. - * - * This is a goddamned awful partition format. We don't seem to store - * the size of the partition in this table, only the start addresses. - * - * There are two possibilities where the size comes from: - * 1. The individual ADFS boot block entries that are placed on the disk. - * 2. The start address of the next entry. - */ -int adfspart_check_EESOX(struct parsed_partitions *state) -{ - Sector sect; - const unsigned char *data; - unsigned char buffer[256]; - struct eesox_part *p; - sector_t start = 0; - int i, slot = 1; - - data = read_part_sector(state, 7, §); - if (!data) - return -1; - - /* - * "Decrypt" the partition table. God knows why... - */ - for (i = 0; i < 256; i++) - buffer[i] = data[i] ^ eesox_name[i & 15]; - - put_dev_sector(sect); - - for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { - sector_t next; - - if (memcmp(p->magic, "Eesox", 6)) - break; - - next = le32_to_cpu(p->start); - if (i) - put_partition(state, slot++, start, next - start); - start = next; - } - - if (i != 0) { - sector_t size; - - size = get_capacity(state->bdev->bd_disk); - put_partition(state, slot++, start, size - start); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - } - - return i ? 1 : 0; -} -#endif diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h deleted file mode 100644 index ede82852969..00000000000 --- a/fs/partitions/acorn.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * linux/fs/partitions/acorn.h - * - * Copyright (C) 1996-2001 Russell King. - * - * I _hate_ this partitioning mess - why can't we have one defined - * format, and everyone stick to it? - */ - -int adfspart_check_CUMANA(struct parsed_partitions *state); -int adfspart_check_ADFS(struct parsed_partitions *state); -int adfspart_check_ICS(struct parsed_partitions *state); -int adfspart_check_POWERTEC(struct parsed_partitions *state); -int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c deleted file mode 100644 index 70cbf44a156..00000000000 --- a/fs/partitions/amiga.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * fs/partitions/amiga.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/types.h> -#include <linux/affs_hardblocks.h> - -#include "check.h" -#include "amiga.h" - -static __inline__ u32 -checksum_block(__be32 *m, int size) -{ - u32 sum = 0; - - while (size--) - sum += be32_to_cpu(*m++); - return sum; -} - -int amiga_partition(struct parsed_partitions *state) -{ - Sector sect; - unsigned char *data; - struct RigidDiskBlock *rdb; - struct PartitionBlock *pb; - int start_sect, nr_sects, blk, part, res = 0; - int blksize = 1; /* Multiplier for disk block size */ - int slot = 1; - char b[BDEVNAME_SIZE]; - - for (blk = 0; ; blk++, put_dev_sector(sect)) { - if (blk == RDB_ALLOCATION_LIMIT) - goto rdb_done; - data = read_part_sector(state, blk, §); - if (!data) { - if (warn_no_part) - printk("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); - res = -1; - goto rdb_done; - } - if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) - continue; - - rdb = (struct RigidDiskBlock *)data; - if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) - break; - /* Try again with 0xdc..0xdf zeroed, Windows might have - * trashed it. - */ - *(__be32 *)(data+0xdc) = 0; - if (checksum_block((__be32 *)data, - be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - printk("Warning: Trashed word at 0xd0 in block %d " - "ignored in checksum calculation\n",blk); - break; - } - - printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); - } - - /* blksize is blocks per 512 byte standard block */ - blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; - - { - char tmp[7 + 10 + 1 + 1]; - - /* Be more informative */ - snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - blk = be32_to_cpu(rdb->rdb_PartitionList); - put_dev_sector(sect); - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { - blk *= blksize; /* Read in terms partition table understands */ - data = read_part_sector(state, blk, §); - if (!data) { - if (warn_no_part) - printk("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); - res = -1; - goto rdb_done; - } - pb = (struct PartitionBlock *)data; - blk = be32_to_cpu(pb->pb_Next); - if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) - continue; - if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) - continue; - - /* Tell Kernel about it */ - - nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; - if (!nr_sects) - continue; - start_sect = be32_to_cpu(pb->pb_Environment[9]) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; - put_partition(state,slot++,start_sect,nr_sects); - { - /* Be even more informative to aid mounting */ - char dostype[4]; - char tmp[42]; - - __be32 *dt = (__be32 *)dostype; - *dt = pb->pb_Environment[16]; - if (dostype[3] < ' ') - snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", - dostype[0], dostype[1], - dostype[2], dostype[3] + '@' ); - else - snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", - dostype[0], dostype[1], - dostype[2], dostype[3]); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - snprintf(tmp, sizeof(tmp), "(res %d spb %d)", - be32_to_cpu(pb->pb_Environment[6]), - be32_to_cpu(pb->pb_Environment[4])); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - res = 1; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - -rdb_done: - return res; -} diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h deleted file mode 100644 index d094585cada..00000000000 --- a/fs/partitions/amiga.h +++ /dev/null @@ -1,6 +0,0 @@ -/* - * fs/partitions/amiga.h - */ - -int amiga_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c deleted file mode 100644 index 9875b05e80a..00000000000 --- a/fs/partitions/atari.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * fs/partitions/atari.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/ctype.h> -#include "check.h" -#include "atari.h" - -/* ++guenther: this should be settable by the user ("make config")?. - */ -#define ICD_PARTS - -/* check if a partition entry looks valid -- Atari format is assumed if at - least one of the primary entries is ok this way */ -#define VALID_PARTITION(pi,hdsiz) \ - (((pi)->flg & 1) && \ - isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ - be32_to_cpu((pi)->st) <= (hdsiz) && \ - be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) - -static inline int OK_id(char *s) -{ - return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || - memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || - memcmp (s, "RAW", 3) == 0 ; -} - -int atari_partition(struct parsed_partitions *state) -{ - Sector sect; - struct rootsector *rs; - struct partition_info *pi; - u32 extensect; - u32 hd_size; - int slot; -#ifdef ICD_PARTS - int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ -#endif - - rs = read_part_sector(state, 0, §); - if (!rs) - return -1; - - /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; - if (!VALID_PARTITION(&rs->part[0], hd_size) && - !VALID_PARTITION(&rs->part[1], hd_size) && - !VALID_PARTITION(&rs->part[2], hd_size) && - !VALID_PARTITION(&rs->part[3], hd_size)) { - /* - * if there's no valid primary partition, assume that no Atari - * format partition table (there's no reliable magic or the like - * :-() - */ - put_dev_sector(sect); - return 0; - } - - pi = &rs->part[0]; - strlcat(state->pp_buf, " AHDI", PAGE_SIZE); - for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { - struct rootsector *xrs; - Sector sect2; - ulong partsect; - - if ( !(pi->flg & 1) ) - continue; - /* active partition */ - if (memcmp (pi->id, "XGM", 3) != 0) { - /* we don't care about other id's */ - put_partition (state, slot, be32_to_cpu(pi->st), - be32_to_cpu(pi->siz)); - continue; - } - /* extension partition */ -#ifdef ICD_PARTS - part_fmt = 1; -#endif - strlcat(state->pp_buf, " XGM<", PAGE_SIZE); - partsect = extensect = be32_to_cpu(pi->st); - while (1) { - xrs = read_part_sector(state, partsect, §2); - if (!xrs) { - printk (" block %ld read failed\n", partsect); - put_dev_sector(sect); - return -1; - } - - /* ++roman: sanity check: bit 0 of flg field must be set */ - if (!(xrs->part[0].flg & 1)) { - printk( "\nFirst sub-partition in extended partition is not valid!\n" ); - put_dev_sector(sect2); - break; - } - - put_partition(state, slot, - partsect + be32_to_cpu(xrs->part[0].st), - be32_to_cpu(xrs->part[0].siz)); - - if (!(xrs->part[1].flg & 1)) { - /* end of linked partition list */ - put_dev_sector(sect2); - break; - } - if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { - printk("\nID of extended partition is not XGM!\n"); - put_dev_sector(sect2); - break; - } - - partsect = be32_to_cpu(xrs->part[1].st) + extensect; - put_dev_sector(sect2); - if (++slot == state->limit) { - printk( "\nMaximum number of partitions reached!\n" ); - break; - } - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - } -#ifdef ICD_PARTS - if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ - pi = &rs->icdpart[0]; - /* sanity check: no ICD format if first partition invalid */ - if (OK_id(pi->id)) { - strlcat(state->pp_buf, " ICD<", PAGE_SIZE); - for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { - /* accept only GEM,BGM,RAW,LNX,SWP partitions */ - if (!((pi->flg & 1) && OK_id(pi->id))) - continue; - part_fmt = 2; - put_partition (state, slot, - be32_to_cpu(pi->st), - be32_to_cpu(pi->siz)); - } - strlcat(state->pp_buf, " >", PAGE_SIZE); - } - } -#endif - put_dev_sector(sect); - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - - return 1; -} diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h deleted file mode 100644 index fe2d32a89f3..00000000000 --- a/fs/partitions/atari.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * fs/partitions/atari.h - * Moved by Russell King from: - * - * linux/include/linux/atari_rootsec.h - * definitions for Atari Rootsector layout - * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) - * - * modified for ICD/Supra partitioning scheme restricted to at most 12 - * partitions - * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) - */ - -struct partition_info -{ - u8 flg; /* bit 0: active; bit 7: bootable */ - char id[3]; /* "GEM", "BGM", "XGM", or other */ - __be32 st; /* start of partition */ - __be32 siz; /* length of partition */ -}; - -struct rootsector -{ - char unused[0x156]; /* room for boot code */ - struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ - char unused2[0xc]; - u32 hd_siz; /* size of disk in blocks */ - struct partition_info part[4]; - u32 bsl_st; /* start of bad sector list */ - u32 bsl_cnt; /* length of bad sector list */ - u16 checksum; /* checksum for bootable disks */ -} __attribute__((__packed__)); - -int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c deleted file mode 100644 index e3c63d1c5e1..00000000000 --- a/fs/partitions/check.c +++ /dev/null @@ -1,687 +0,0 @@ -/* - * fs/partitions/check.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - * - * We now have independent partition support from the - * block drivers, which allows all the partition code to - * be grouped in one location, and it to be mostly self - * contained. - * - * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/ctype.h> -#include <linux/genhd.h> -#include <linux/blktrace_api.h> - -#include "check.h" - -#include "acorn.h" -#include "amiga.h" -#include "atari.h" -#include "ldm.h" -#include "mac.h" -#include "msdos.h" -#include "osf.h" -#include "sgi.h" -#include "sun.h" -#include "ibm.h" -#include "ultrix.h" -#include "efi.h" -#include "karma.h" -#include "sysv68.h" - -#ifdef CONFIG_BLK_DEV_MD -extern void md_autodetect_dev(dev_t dev); -#endif - -int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ - -static int (*check_part[])(struct parsed_partitions *) = { - /* - * Probe partition formats with tables at disk address 0 - * that also have an ADFS boot block at 0xdc0. - */ -#ifdef CONFIG_ACORN_PARTITION_ICS - adfspart_check_ICS, -#endif -#ifdef CONFIG_ACORN_PARTITION_POWERTEC - adfspart_check_POWERTEC, -#endif -#ifdef CONFIG_ACORN_PARTITION_EESOX - adfspart_check_EESOX, -#endif - - /* - * Now move on to formats that only have partition info at - * disk address 0xdc0. Since these may also have stale - * PC/BIOS partition tables, they need to come before - * the msdos entry. - */ -#ifdef CONFIG_ACORN_PARTITION_CUMANA - adfspart_check_CUMANA, -#endif -#ifdef CONFIG_ACORN_PARTITION_ADFS - adfspart_check_ADFS, -#endif - -#ifdef CONFIG_EFI_PARTITION - efi_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_SGI_PARTITION - sgi_partition, -#endif -#ifdef CONFIG_LDM_PARTITION - ldm_partition, /* this must come before msdos */ -#endif -#ifdef CONFIG_MSDOS_PARTITION - msdos_partition, -#endif -#ifdef CONFIG_OSF_PARTITION - osf_partition, -#endif -#ifdef CONFIG_SUN_PARTITION - sun_partition, -#endif -#ifdef CONFIG_AMIGA_PARTITION - amiga_partition, -#endif -#ifdef CONFIG_ATARI_PARTITION - atari_partition, -#endif -#ifdef CONFIG_MAC_PARTITION - mac_partition, -#endif -#ifdef CONFIG_ULTRIX_PARTITION - ultrix_partition, -#endif -#ifdef CONFIG_IBM_PARTITION - ibm_partition, -#endif -#ifdef CONFIG_KARMA_PARTITION - karma_partition, -#endif -#ifdef CONFIG_SYSV68_PARTITION - sysv68_partition, -#endif - NULL -}; - -/* - * disk_name() is used by partition check code and the genhd driver. - * It formats the devicename of the indicated disk into - * the supplied buffer (of size at least 32), and returns - * a pointer to that same buffer (for convenience). - */ - -char *disk_name(struct gendisk *hd, int partno, char *buf) -{ - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); - - return buf; -} - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); -} - -EXPORT_SYMBOL(bdevname); - -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - -static struct parsed_partitions * -check_partition(struct gendisk *hd, struct block_device *bdev) -{ - struct parsed_partitions *state; - int i, res, err; - - state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); - if (!state) - return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { - kfree(state); - return NULL; - } - state->pp_buf[0] = '\0'; - - state->bdev = bdev; - disk_name(hd, 0, state->name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); - if (isdigit(state->name[strlen(state->name)-1])) - sprintf(state->name, "p"); - - state->limit = disk_max_parts(hd); - i = res = err = 0; - while (!res && check_part[i]) { - memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state); - if (res < 0) { - /* We have hit an I/O error which we don't report now. - * But record it, and let the others do their job. - */ - err = res; - res = 0; - } - - } - if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - return state; - } - if (state->access_beyond_eod) - err = -ENOSPC; - if (err) - /* The partition is unrecognized. So report I/O errors if there were any */ - res = err; - if (!res) - strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); - else if (warn_no_part) - strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); - - printk(KERN_INFO "%s", state->pp_buf); - - free_page((unsigned long)state->pp_buf); - kfree(state); - return ERR_PTR(res); -} - -static ssize_t part_partition_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->partno); -} - -static ssize_t part_start_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); -} - -ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); -} - -static ssize_t part_ro_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); -} - -static ssize_t part_alignment_offset_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); -} - -static ssize_t part_discard_alignment_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); -} - -ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - int cpu; - - cpu = part_stat_lock(); - part_round_stats(cpu, p); - part_stat_unlock(); - return sprintf(buf, - "%8lu %8lu %8llu %8u " - "%8lu %8lu %8llu %8u " - "%8u %8u %8u" - "\n", - part_stat_read(p, ios[READ]), - part_stat_read(p, merges[READ]), - (unsigned long long)part_stat_read(p, sectors[READ]), - jiffies_to_msecs(part_stat_read(p, ticks[READ])), - part_stat_read(p, ios[WRITE]), - part_stat_read(p, merges[WRITE]), - (unsigned long long)part_stat_read(p, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), - jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); -} - -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), - atomic_read(&p->in_flight[1])); -} - -#ifdef CONFIG_FAIL_MAKE_REQUEST -ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); -} - -ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct hd_struct *p = dev_to_part(dev); - int i; - - if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; - - return count; -} -#endif - -static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); -static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); -static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); -static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); -static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); -static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, - NULL); -static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); -static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); -#ifdef CONFIG_FAIL_MAKE_REQUEST -static struct device_attribute dev_attr_fail = - __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); -#endif - -static struct attribute *part_attrs[] = { - &dev_attr_partition.attr, - &dev_attr_start.attr, - &dev_attr_size.attr, - &dev_attr_ro.attr, - &dev_attr_alignment_offset.attr, - &dev_attr_discard_alignment.attr, - &dev_attr_stat.attr, - &dev_attr_inflight.attr, -#ifdef CONFIG_FAIL_MAKE_REQUEST - &dev_attr_fail.attr, -#endif - NULL -}; - -static struct attribute_group part_attr_group = { - .attrs = part_attrs, -}; - -static const struct attribute_group *part_attr_groups[] = { - &part_attr_group, -#ifdef CONFIG_BLK_DEV_IO_TRACE - &blk_trace_attr_group, -#endif - NULL -}; - -static void part_release(struct device *dev) -{ - struct hd_struct *p = dev_to_part(dev); - free_part_stats(p); - free_part_info(p); - kfree(p); -} - -struct device_type part_type = { - .name = "partition", - .groups = part_attr_groups, - .release = part_release, -}; - -static void delete_partition_rcu_cb(struct rcu_head *head) -{ - struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); - - part->start_sect = 0; - part->nr_sects = 0; - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -void __delete_partition(struct hd_struct *part) -{ - call_rcu(&part->rcu_head, delete_partition_rcu_cb); -} - -void delete_partition(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = disk->part_tbl; - struct hd_struct *part; - - if (partno >= ptbl->len) - return; - - part = ptbl->part[partno]; - if (!part) - return; - - blk_free_devt(part_devt(part)); - rcu_assign_pointer(ptbl->part[partno], NULL); - rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->holder_dir); - device_del(part_to_dev(part)); - - hd_struct_put(part); -} - -static ssize_t whole_disk_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} -static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, - whole_disk_show, NULL); - -struct hd_struct *add_partition(struct gendisk *disk, int partno, - sector_t start, sector_t len, int flags, - struct partition_meta_info *info) -{ - struct hd_struct *p; - dev_t devt = MKDEV(0, 0); - struct device *ddev = disk_to_dev(disk); - struct device *pdev; - struct disk_part_tbl *ptbl; - const char *dname; - int err; - - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = disk->part_tbl; - - if (ptbl->part[partno]) - return ERR_PTR(-EBUSY); - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - - if (!init_part_stats(p)) { - err = -ENOMEM; - goto out_free; - } - pdev = part_to_dev(p); - - p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); - p->nr_sects = len; - p->partno = partno; - p->policy = get_disk_ro(disk); - - if (info) { - struct partition_meta_info *pinfo = alloc_part_info(disk); - if (!pinfo) - goto out_free_stats; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; - } - - dname = dev_name(ddev); - if (isdigit(dname[strlen(dname) - 1])) - dev_set_name(pdev, "%sp%d", dname, partno); - else - dev_set_name(pdev, "%s%d", dname, partno); - - device_initialize(pdev); - pdev->class = &block_class; - pdev->type = &part_type; - pdev->parent = ddev; - - err = blk_alloc_devt(p, &devt); - if (err) - goto out_free_info; - pdev->devt = devt; - - /* delay uevent until 'holders' subdir is created */ - dev_set_uevent_suppress(pdev, 1); - err = device_add(pdev); - if (err) - goto out_put; - - err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) - goto out_del; - - dev_set_uevent_suppress(pdev, 0); - if (flags & ADDPART_FLAG_WHOLEDISK) { - err = device_create_file(pdev, &dev_attr_whole_disk); - if (err) - goto out_del; - } - - /* everything is up and running, commence */ - rcu_assign_pointer(ptbl->part[partno], p); - - /* suppress uevent if the disk suppresses it */ - if (!dev_get_uevent_suppress(ddev)) - kobject_uevent(&pdev->kobj, KOBJ_ADD); - - hd_ref_init(p); - return p; - -out_free_info: - free_part_info(p); -out_free_stats: - free_part_stats(p); -out_free: - kfree(p); - return ERR_PTR(err); -out_del: - kobject_put(p->holder_dir); - device_del(pdev); -out_put: - put_device(pdev); - blk_free_devt(devt); - return ERR_PTR(err); -} - -static bool disk_unlock_native_capacity(struct gendisk *disk) -{ - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { - printk(KERN_CONT "truncated\n"); - return false; - } -} - -int rescan_partitions(struct gendisk *disk, struct block_device *bdev) -{ - struct parsed_partitions *state = NULL; - struct disk_part_iter piter; - struct hd_struct *part; - int p, highest, res; -rescan: - if (state && !IS_ERR(state)) { - kfree(state); - state = NULL; - } - - if (bdev->bd_part_count) - return -EBUSY; - res = invalidate_partition(disk, 0); - if (res) - return res; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part->partno); - disk_part_iter_exit(&piter); - - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) - return 0; - if (IS_ERR(state)) { - /* - * I/O error reading the partition table. If any - * partition code tried to read beyond EOD, retry - * after unlocking native capacity. - */ - if (PTR_ERR(state) == -ENOSPC) { - printk(KERN_WARNING "%s: partition table beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - goto rescan; - } - return -EIO; - } - /* - * If any partition code tried to read beyond EOD, try - * unlocking native capacity even if partition table is - * successfully read as we could be missing some partitions. - */ - if (state->access_beyond_eod) { - printk(KERN_WARNING - "%s: partition table partially beyond EOD, ", - disk->disk_name); - if (disk_unlock_native_capacity(disk)) - goto rescan; - } - - /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - - /* Detect the highest partition number and preallocate - * disk->part_tbl. This is an optimization and not strictly - * necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - - disk_expand_part_tbl(disk, highest); - - /* add partitions */ - for (p = 1; p < state->limit; p++) { - sector_t size, from; - struct partition_meta_info *info = NULL; - - size = state->parts[p].size; - if (!size) - continue; - - from = state->parts[p].from; - if (from >= get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d start %llu is beyond EOD, ", - disk->disk_name, p, (unsigned long long) from); - if (disk_unlock_native_capacity(disk)) - goto rescan; - continue; - } - - if (from + size > get_capacity(disk)) { - printk(KERN_WARNING - "%s: p%d size %llu extends beyond EOD, ", - disk->disk_name, p, (unsigned long long) size); - - if (disk_unlock_native_capacity(disk)) { - /* free state and restart */ - goto rescan; - } else { - /* - * we can not ignore partitions of broken tables - * created by for example camera firmware, but - * we limit them to the end of the disk to avoid - * creating invalid block devices - */ - size = get_capacity(disk) - from; - } - } - - if (state->parts[p].has_info) - info = &state->parts[p].info; - part = add_partition(disk, p, from, size, - state->parts[p].flags, - &state->parts[p].info); - if (IS_ERR(part)) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); - continue; - } -#ifdef CONFIG_BLK_DEV_MD - if (state->parts[p].flags & ADDPART_FLAG_RAID) - md_autodetect_dev(part_to_dev(part)->devt); -#endif - } - kfree(state); - return 0; -} - -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - struct page *page; - - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); - if (!IS_ERR(page)) { - if (PageError(page)) - goto fail; - p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); -fail: - page_cache_release(page); - } - p->v = NULL; - return NULL; -} - -EXPORT_SYMBOL(read_dev_sector); diff --git a/fs/partitions/check.h b/fs/partitions/check.h deleted file mode 100644 index d68bf4dc3bc..00000000000 --- a/fs/partitions/check.h +++ /dev/null @@ -1,49 +0,0 @@ -#include <linux/pagemap.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> - -/* - * add_gd_partition adds a partitions details to the devices partition - * description. - */ -struct parsed_partitions { - struct block_device *bdev; - char name[BDEVNAME_SIZE]; - struct { - sector_t from; - sector_t size; - int flags; - bool has_info; - struct partition_meta_info info; - } parts[DISK_MAX_PARTS]; - int next; - int limit; - bool access_beyond_eod; - char *pp_buf; -}; - -static inline void *read_part_sector(struct parsed_partitions *state, - sector_t n, Sector *p) -{ - if (n >= get_capacity(state->bdev->bd_disk)) { - state->access_beyond_eod = true; - return NULL; - } - return read_dev_sector(state->bdev, n, p); -} - -static inline void -put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) -{ - if (n < p->limit) { - char tmp[1 + BDEVNAME_SIZE + 10 + 1]; - - p->parts[n].from = from; - p->parts[n].size = size; - snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); - strlcat(p->pp_buf, tmp, PAGE_SIZE); - } -} - -extern int warn_no_part; - diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c deleted file mode 100644 index 6296b403c67..00000000000 --- a/fs/partitions/efi.c +++ /dev/null @@ -1,675 +0,0 @@ -/************************************************************ - * EFI GUID Partition Table handling - * - * http://www.uefi.org/specs/ - * http://www.intel.com/technology/efi/ - * - * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> - * Copyright 2000,2001,2002,2004 Dell Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * TODO: - * - * Changelog: - * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> - * - test for valid PMBR and valid PGPT before ever reading - * AGPT, allow override with 'gpt' kernel command line option. - * - check for first/last_usable_lba outside of size of disk - * - * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Ported to 2.5.7-pre1 and 2.5.7-dj2 - * - Applied patch to avoid fault in alternate header handling - * - cleaned up find_valid_gpt - * - On-disk structure and copy in memory is *always* LE now - - * swab fields as needed - * - remove print_gpt_header() - * - only use first max_p partition entries, to keep the kernel minor number - * and partition numbers tied. - * - * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Removed __PRIPTR_PREFIX - not being used - * - * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> - * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied - * - * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Added compare_gpts(). - * - moved le_efi_guid_to_cpus() back into this file. GPT is the only - * thing that keeps EFI GUIDs on disk. - * - Changed gpt structure names and members to be simpler and more Linux-like. - * - * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck - * - * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Changed function comments to DocBook style per Andreas Dilger suggestion. - * - * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Change read_lba() to use the page cache per Al Viro's work. - * - print u64s properly on all architectures - * - fixed debug_printk(), now Dprintk() - * - * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> - * - Style cleanups - * - made most functions static - * - Endianness addition - * - remove test for second alternate header, as it's not per spec, - * and is unnecessary. There's now a method to read/write the last - * sector of an odd-sized disk from user space. No tools have ever - * been released which used this code, so it's effectively dead. - * - Per Asit Mallick of Intel, added a test for a valid PMBR. - * - Added kernel command line option 'gpt' to override valid PMBR test. - * - * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> - * - added devfs volume UUID support (/dev/volumes/uuids) for - * mounting file systems by the partition GUID. - * - * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Moved crc32() to linux/lib, added efi_crc32(). - * - * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Replaced Intel's CRC32 function with an equivalent - * non-license-restricted version. - * - * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Fixed the last_lba() call to return the proper last block - * - * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> - * - Thanks to Andries Brouwer for his debugging assistance. - * - Code works, detects all the partitions. - * - ************************************************************/ -#include <linux/crc32.h> -#include <linux/ctype.h> -#include <linux/math64.h> -#include <linux/slab.h> -#include "check.h" -#include "efi.h" - -/* This allows a kernel command line option 'gpt' to override - * the test for invalid PMBR. Not __initdata because reloading - * the partition tables happens after init too. - */ -static int force_gpt; -static int __init -force_gpt_fn(char *str) -{ - force_gpt = 1; - return 1; -} -__setup("gpt", force_gpt_fn); - - -/** - * efi_crc32() - EFI version of crc32 function - * @buf: buffer to calculate crc32 of - * @len - length of buf - * - * Description: Returns EFI-style CRC32 value for @buf - * - * This function uses the little endian Ethernet polynomial - * but seeds the function with ~0, and xor's with ~0 at the end. - * Note, the EFI Specification, v1.02, has a reference to - * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). - */ -static inline u32 -efi_crc32(const void *buf, unsigned long len) -{ - return (crc32(~0L, buf, len) ^ ~0L); -} - -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; -} - -static inline int -pmbr_part_valid(struct partition *part) -{ - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; - return 0; -} - -/** - * is_pmbr_valid(): test Protective MBR for validity - * @mbr: pointer to a legacy mbr structure - * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: - * 1) MSDOS signature is in the last two bytes of the MBR - * 2) One partition of type 0xEE is found - */ -static int -is_pmbr_valid(legacy_mbr *mbr) -{ - int i; - if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; - for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; -} - -/** - * read_lba(): Read bytes from disk, starting at given LBA - * @state - * @lba - * @buffer - * @size_t - * - * Description: Reads @count bytes from @state->bdev into @buffer. - * Returns number of bytes read on success, 0 on error. - */ -static size_t read_lba(struct parsed_partitions *state, - u64 lba, u8 *buffer, size_t count) -{ - size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - - if (!buffer || lba > last_lba(bdev)) - return 0; - - while (count) { - int copied = 512; - Sector sect; - unsigned char *data = read_part_sector(state, n++, §); - if (!data) - break; - if (copied > count) - copied = count; - memcpy(buffer, data, copied); - put_dev_sector(sect); - buffer += copied; - totalreadcount +=copied; - count -= copied; - } - return totalreadcount; -} - -/** - * alloc_read_gpt_entries(): reads partition entries from disk - * @state - * @gpt - GPT header - * - * Description: Returns ptes on success, NULL on error. - * Allocates space for PTEs based on information found in @gpt. - * Notes: remember to free pte when you're done! - */ -static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, - gpt_header *gpt) -{ - size_t count; - gpt_entry *pte; - - if (!gpt) - return NULL; - - count = le32_to_cpu(gpt->num_partition_entries) * - le32_to_cpu(gpt->sizeof_partition_entry); - if (!count) - return NULL; - pte = kzalloc(count, GFP_KERNEL); - if (!pte) - return NULL; - - if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { - kfree(pte); - pte=NULL; - return NULL; - } - return pte; -} - -/** - * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @state - * @lba is the Logical Block Address of the partition table - * - * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. - * Note: remember to free gpt when finished with it. - */ -static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, - u64 lba) -{ - gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); - - gpt = kzalloc(ssz, GFP_KERNEL); - if (!gpt) - return NULL; - - if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { - kfree(gpt); - gpt=NULL; - return NULL; - } - - return gpt; -} - -/** - * is_gpt_valid() - tests one GPT header and PTEs for validity - * @state - * @lba is the logical block address of the GPT header to test - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. - * - * Description: returns 1 if valid, 0 on error. - * If valid, returns pointers to newly allocated GPT header and PTEs. - */ -static int is_gpt_valid(struct parsed_partitions *state, u64 lba, - gpt_header **gpt, gpt_entry **ptes) -{ - u32 crc, origcrc; - u64 lastlba; - - if (!ptes) - return 0; - if (!(*gpt = alloc_read_gpt_header(state, lba))) - return 0; - - /* Check the GUID Partition Table signature */ - if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { - pr_debug("GUID Partition Table Header signature is wrong:" - "%lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->signature), - (unsigned long long)GPT_HEADER_SIGNATURE); - goto fail; - } - - /* Check the GUID Partition Table header size */ - if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { - pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", - le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); - goto fail; - } - - /* Check the GUID Partition Table CRC */ - origcrc = le32_to_cpu((*gpt)->header_crc32); - (*gpt)->header_crc32 = 0; - crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); - - if (crc != origcrc) { - pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", - crc, origcrc); - goto fail; - } - (*gpt)->header_crc32 = cpu_to_le32(origcrc); - - /* Check that the my_lba entry points to the LBA that contains - * the GUID Partition Table */ - if (le64_to_cpu((*gpt)->my_lba) != lba) { - pr_debug("GPT my_lba incorrect: %lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->my_lba), - (unsigned long long)lba); - goto fail; - } - - /* Check the first_usable_lba and last_usable_lba are - * within the disk. - */ - lastlba = last_lba(state->bdev); - if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { - pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), - (unsigned long long)lastlba); - goto fail; - } - if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { - pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), - (unsigned long long)lastlba); - goto fail; - } - - /* Check that sizeof_partition_entry has the correct value */ - if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { - pr_debug("GUID Partitition Entry Size check failed.\n"); - goto fail; - } - - if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) - goto fail; - - /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); - - if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - pr_debug("GUID Partitition Entry Array CRC check failed.\n"); - goto fail_ptes; - } - - /* We're done, all's well */ - return 1; - - fail_ptes: - kfree(*ptes); - *ptes = NULL; - fail: - kfree(*gpt); - *gpt = NULL; - return 0; -} - -/** - * is_pte_valid() - tests one PTE for validity - * @pte is the pte to check - * @lastlba is last lba of the disk - * - * Description: returns 1 if valid, 0 on error. - */ -static inline int -is_pte_valid(const gpt_entry *pte, const u64 lastlba) -{ - if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || - le64_to_cpu(pte->starting_lba) > lastlba || - le64_to_cpu(pte->ending_lba) > lastlba) - return 0; - return 1; -} - -/** - * compare_gpts() - Search disk for valid GPT headers and PTEs - * @pgpt is the primary GPT header - * @agpt is the alternate GPT header - * @lastlba is the last LBA number - * Description: Returns nothing. Sanity checks pgpt and agpt fields - * and prints warnings on discrepancies. - * - */ -static void -compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) -{ - int error_found = 0; - if (!pgpt || !agpt) - return; - if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->my_lba), - (unsigned long long)le64_to_cpu(agpt->alternate_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->alternate_lba), - (unsigned long long)le64_to_cpu(agpt->my_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->first_usable_lba) != - le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), - (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); - error_found++; - } - if (le64_to_cpu(pgpt->last_usable_lba) != - le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), - (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); - error_found++; - } - if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); - error_found++; - } - if (le32_to_cpu(pgpt->num_partition_entries) != - le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->num_partition_entries), - le32_to_cpu(agpt->num_partition_entries)); - error_found++; - } - if (le32_to_cpu(pgpt->sizeof_partition_entry) != - le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->sizeof_partition_entry), - le32_to_cpu(agpt->sizeof_partition_entry)); - error_found++; - } - if (le32_to_cpu(pgpt->partition_entry_array_crc32) != - le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " - "0x%x != 0x%x\n", - le32_to_cpu(pgpt->partition_entry_array_crc32), - le32_to_cpu(agpt->partition_entry_array_crc32)); - error_found++; - } - if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(pgpt->alternate_lba), - (unsigned long long)lastlba); - error_found++; - } - - if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", - (unsigned long long)le64_to_cpu(agpt->my_lba), - (unsigned long long)lastlba); - error_found++; - } - - if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); - return; -} - -/** - * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @state - * @gpt is a GPT header ptr, filled on return. - * @ptes is a PTEs ptr, filled on return. - * Description: Returns 1 if valid, 0 on error. - * If valid, returns pointers to newly allocated GPT header and PTEs. - * Validity depends on PMBR being valid (or being overridden by the - * 'gpt' kernel command line option) and finding either the Primary - * GPT header and PTEs valid, or the Alternate GPT header and PTEs - * valid. If the Primary GPT header is not valid, the Alternate GPT header - * is not checked unless the 'gpt' kernel command line option is passed. - * This protects against devices which misreport their size, and forces - * the user to decide to use the Alternate GPT. - */ -static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, - gpt_entry **ptes) -{ - int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; - gpt_header *pgpt = NULL, *agpt = NULL; - gpt_entry *pptes = NULL, *aptes = NULL; - legacy_mbr *legacymbr; - u64 lastlba; - - if (!ptes) - return 0; - - lastlba = last_lba(state->bdev); - if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } - - good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, - &pgpt, &pptes); - if (good_pgpt) - good_agpt = is_gpt_valid(state, - le64_to_cpu(pgpt->alternate_lba), - &agpt, &aptes); - if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); - - /* The obviously unsuccessful case */ - if (!good_pgpt && !good_agpt) - goto fail; - - compare_gpts(pgpt, agpt, lastlba); - - /* The good cases */ - if (good_pgpt) { - *gpt = pgpt; - *ptes = pptes; - kfree(agpt); - kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } - return 1; - } - else if (good_agpt) { - *gpt = agpt; - *ptes = aptes; - kfree(pgpt); - kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); - return 1; - } - - fail: - kfree(pgpt); - kfree(agpt); - kfree(pptes); - kfree(aptes); - *gpt = NULL; - *ptes = NULL; - return 0; -} - -/** - * efi_partition(struct parsed_partitions *state) - * @state - * - * Description: called from check.c, if the disk contains GPT - * partitions, sets up partition entries in the kernel. - * - * If the first block on the disk is a legacy MBR, - * it will get handled by msdos_partition(). - * If it's a Protective MBR, we'll handle it here. - * - * We do not create a Linux partition for GPT, but - * only for the actual data partitions. - * Returns: - * -1 if unable to read the partition table - * 0 if this isn't our partition table - * 1 if successful - * - */ -int efi_partition(struct parsed_partitions *state) -{ - gpt_header *gpt = NULL; - gpt_entry *ptes = NULL; - u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - u8 unparsed_guid[37]; - - if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { - kfree(gpt); - kfree(ptes); - return 0; - } - - pr_debug("GUID Partition Table is valid! Yea!\n"); - - for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { - struct partition_meta_info *info; - unsigned label_count = 0; - unsigned label_max; - u64 start = le64_to_cpu(ptes[i].starting_lba); - u64 size = le64_to_cpu(ptes[i].ending_lba) - - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) - continue; - - put_partition(state, i+1, start * ssz, size * ssz); - - /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) - state->parts[i + 1].flags = ADDPART_FLAG_RAID; - - info = &state->parts[i + 1].info; - /* Instead of doing a manual swap to big endian, reuse the - * common ASCII hex format as the interim. - */ - efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); - part_pack_uuid(unparsed_guid, info->uuid); - - /* Naively convert UTF16-LE to 7 bits. */ - label_max = min(sizeof(info->volname) - 1, - sizeof(ptes[i].partition_name)); - info->volname[label_max] = 0; - while (label_count < label_max) { - u8 c = ptes[i].partition_name[label_count] & 0xff; - if (c && !isprint(c)) - c = '!'; - info->volname[label_count] = c; - label_count++; - } - state->parts[i + 1].has_info = true; - } - kfree(ptes); - kfree(gpt); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h deleted file mode 100644 index b69ab729558..00000000000 --- a/fs/partitions/efi.h +++ /dev/null @@ -1,134 +0,0 @@ -/************************************************************ - * EFI GUID Partition Table - * Per Intel EFI Specification v1.02 - * http://developer.intel.com/technology/efi/efi.htm - * - * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000 - * Copyright 2000,2001 Dell Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - ************************************************************/ - -#ifndef FS_PART_EFI_H_INCLUDED -#define FS_PART_EFI_H_INCLUDED - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/string.h> -#include <linux/efi.h> - -#define MSDOS_MBR_SIGNATURE 0xaa55 -#define EFI_PMBR_OSTYPE_EFI 0xEF -#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE - -#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL -#define GPT_HEADER_REVISION_V1 0x00010000 -#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 - -#define PARTITION_SYSTEM_GUID \ - EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ - 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) -#define LEGACY_MBR_PARTITION_GUID \ - EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ - 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) -#define PARTITION_MSFT_RESERVED_GUID \ - EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ - 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) -#define PARTITION_BASIC_DATA_GUID \ - EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ - 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) -#define PARTITION_LINUX_RAID_GUID \ - EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ - 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) -#define PARTITION_LINUX_SWAP_GUID \ - EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ - 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) -#define PARTITION_LINUX_LVM_GUID \ - EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ - 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) - -typedef struct _gpt_header { - __le64 signature; - __le32 revision; - __le32 header_size; - __le32 header_crc32; - __le32 reserved1; - __le64 my_lba; - __le64 alternate_lba; - __le64 first_usable_lba; - __le64 last_usable_lba; - efi_guid_t disk_guid; - __le64 partition_entry_lba; - __le32 num_partition_entries; - __le32 sizeof_partition_entry; - __le32 partition_entry_array_crc32; - - /* The rest of the logical block is reserved by UEFI and must be zero. - * EFI standard handles this by: - * - * uint8_t reserved2[ BlockSize - 92 ]; - */ -} __attribute__ ((packed)) gpt_header; - -typedef struct _gpt_entry_attributes { - u64 required_to_function:1; - u64 reserved:47; - u64 type_guid_specific:16; -} __attribute__ ((packed)) gpt_entry_attributes; - -typedef struct _gpt_entry { - efi_guid_t partition_type_guid; - efi_guid_t unique_partition_guid; - __le64 starting_lba; - __le64 ending_lba; - gpt_entry_attributes attributes; - efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; -} __attribute__ ((packed)) gpt_entry; - -typedef struct _legacy_mbr { - u8 boot_code[440]; - __le32 unique_mbr_signature; - __le16 unknown; - struct partition partition_record[4]; - __le16 signature; -} __attribute__ ((packed)) legacy_mbr; - -/* Functions */ -extern int efi_partition(struct parsed_partitions *state); - -#endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c deleted file mode 100644 index d513a07f44b..00000000000 --- a/fs/partitions/ibm.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * File...........: linux/fs/partitions/ibm.c - * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> - * Volker Sameske <sameske@de.ibm.com> - * Bugreports.to..: <Linux390@de.ibm.com> - * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 - */ - -#include <linux/buffer_head.h> -#include <linux/hdreg.h> -#include <linux/slab.h> -#include <asm/dasd.h> -#include <asm/ebcdic.h> -#include <asm/uaccess.h> -#include <asm/vtoc.h> - -#include "check.h" -#include "ibm.h" - -/* - * compute the block number from a - * cyl-cyl-head-head structure - */ -static sector_t -cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) { - - sector_t cyl; - __u16 head; - - /*decode cylinder and heads for large volumes */ - cyl = ptr->hh & 0xFFF0; - cyl <<= 12; - cyl |= ptr->cc; - head = ptr->hh & 0x000F; - return cyl * geo->heads * geo->sectors + - head * geo->sectors; -} - -/* - * compute the block number from a - * cyl-cyl-head-head-block structure - */ -static sector_t -cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { - - sector_t cyl; - __u16 head; - - /*decode cylinder and heads for large volumes */ - cyl = ptr->hh & 0xFFF0; - cyl <<= 12; - cyl |= ptr->cc; - head = ptr->hh & 0x000F; - return cyl * geo->heads * geo->sectors + - head * geo->sectors + - ptr->b; -} - -/* - */ -int ibm_partition(struct parsed_partitions *state) -{ - struct block_device *bdev = state->bdev; - int blocksize, res; - loff_t i_size, offset, size, fmt_size; - dasd_information2_t *info; - struct hd_geometry *geo; - char type[5] = {0,}; - char name[7] = {0,}; - union label_t { - struct vtoc_volume_label_cdl vol; - struct vtoc_volume_label_ldl lnx; - struct vtoc_cms_label cms; - } *label; - unsigned char *data; - Sector sect; - sector_t labelsect; - char tmp[64]; - - res = 0; - blocksize = bdev_logical_block_size(bdev); - if (blocksize <= 0) - goto out_exit; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) - goto out_exit; - - info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); - if (info == NULL) - goto out_exit; - geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); - if (geo == NULL) - goto out_nogeo; - label = kmalloc(sizeof(union label_t), GFP_KERNEL); - if (label == NULL) - goto out_nolab; - - if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 || - ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) - goto out_freeall; - - /* - * Special case for FBA disks: label sector does not depend on - * blocksize. - */ - if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || - (info->cu_type == 0x3880 && info->dev_type == 0x3370)) - labelsect = info->label_block; - else - labelsect = info->label_block * (blocksize >> 9); - - /* - * Get volume label, extract name and type. - */ - data = read_part_sector(state, labelsect, §); - if (data == NULL) - goto out_readerr; - - memcpy(label, data, sizeof(union label_t)); - put_dev_sector(sect); - - if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { - strncpy(type, label->vol.vollbl, 4); - strncpy(name, label->vol.volid, 6); - } else { - strncpy(type, label->lnx.vollbl, 4); - strncpy(name, label->lnx.volid, 6); - } - EBCASC(type, 4); - EBCASC(name, 6); - - res = 1; - - /* - * Three different formats: LDL, CDL and unformated disk - * - * identified by info->format - * - * unformated disks we do not have to care about - */ - if (info->format == DASD_FORMAT_LDL) { - if (strncmp(type, "CMS1", 4) == 0) { - /* - * VM style CMS1 labeled disk - */ - blocksize = label->cms.block_size; - if (label->cms.disk_offset != 0) { - snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* disk is reserved minidisk */ - offset = label->cms.disk_offset; - size = (label->cms.block_count - 1) - * (blocksize >> 9); - } else { - snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - offset = (info->label_block + 1); - size = label->cms.block_count - * (blocksize >> 9); - } - put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); - } else { - if (strncmp(type, "LNX1", 4) == 0) { - snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - if (label->lnx.ldl_version == 0xf2) { - fmt_size = label->lnx.formatted_blocks - * (blocksize >> 9); - } else if (!strcmp(info->type, "ECKD")) { - /* formated w/o large volume support */ - fmt_size = geo->cylinders * geo->heads - * geo->sectors * (blocksize >> 9); - } else { - /* old label and no usable disk geometry - * (e.g. DIAG) */ - fmt_size = i_size >> 9; - } - size = i_size >> 9; - if (fmt_size < size) - size = fmt_size; - offset = (info->label_block + 1); - } else { - /* unlabeled disk */ - strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; - offset = (info->label_block + 1); - } - put_partition(state, 1, offset*(blocksize >> 9), - size-offset*(blocksize >> 9)); - } - } else if (info->format == DASD_FORMAT_CDL) { - /* - * New style CDL formatted disk - */ - sector_t blk; - int counter; - - /* - * check if VOL1 label is available - * if not, something is wrong, skipping partition detection - */ - if (strncmp(type, "VOL1", 4) == 0) { - snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* - * get block number and read then go through format1 - * labels - */ - blk = cchhb2blk(&label->vol.vtoc, geo) + 1; - counter = 0; - data = read_part_sector(state, blk * (blocksize/512), - §); - while (data != NULL) { - struct vtoc_format1_label f1; - - memcpy(&f1, data, - sizeof(struct vtoc_format1_label)); - put_dev_sector(sect); - - /* skip FMT4 / FMT5 / FMT7 labels */ - if (f1.DS1FMTID == _ascebc['4'] - || f1.DS1FMTID == _ascebc['5'] - || f1.DS1FMTID == _ascebc['7'] - || f1.DS1FMTID == _ascebc['9']) { - blk++; - data = read_part_sector(state, - blk * (blocksize/512), §); - continue; - } - - /* only FMT1 and 8 labels valid at this point */ - if (f1.DS1FMTID != _ascebc['1'] && - f1.DS1FMTID != _ascebc['8']) - break; - - /* OK, we got valid partition data */ - offset = cchh2blk(&f1.DS1EXT1.llimit, geo); - size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - - offset + geo->sectors; - if (counter >= state->limit) - break; - put_partition(state, counter + 1, - offset * (blocksize >> 9), - size * (blocksize >> 9)); - counter++; - blk++; - data = read_part_sector(state, - blk * (blocksize/512), §); - } - - if (!data) - /* Are we not supposed to report this ? */ - goto out_readerr; - } else - printk(KERN_WARNING "Warning, expected Label VOL1 not " - "found, treating as CDL formated Disk"); - - } - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - goto out_freeall; - - -out_readerr: - res = -1; -out_freeall: - kfree(label); -out_nolab: - kfree(geo); -out_nogeo: - kfree(info); -out_exit: - return res; -} diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h deleted file mode 100644 index 08fb0804a81..00000000000 --- a/fs/partitions/ibm.h +++ /dev/null @@ -1 +0,0 @@ -int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c deleted file mode 100644 index 0ea19312706..00000000000 --- a/fs/partitions/karma.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * fs/partitions/karma.c - * Rio Karma partition info. - * - * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) - * based on osf.c - */ - -#include "check.h" -#include "karma.h" - -int karma_partition(struct parsed_partitions *state) -{ - int i; - int slot = 1; - Sector sect; - unsigned char *data; - struct disklabel { - u8 d_reserved[270]; - struct d_partition { - __le32 p_res; - u8 p_fstype; - u8 p_res2[3]; - __le32 p_offset; - __le32 p_size; - } d_partitions[2]; - u8 d_blank[208]; - __le16 d_magic; - } __attribute__((packed)) *label; - struct d_partition *p; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - label = (struct disklabel *)data; - if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { - put_dev_sector(sect); - return 0; - } - - p = label->d_partitions; - for (i = 0 ; i < 2; i++, p++) { - if (slot == state->limit) - break; - - if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { - put_partition(state, slot, le32_to_cpu(p->p_offset), - le32_to_cpu(p->p_size)); - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} - diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h deleted file mode 100644 index c764b2e9df2..00000000000 --- a/fs/partitions/karma.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/karma.h - */ - -#define KARMA_LABEL_MAGIC 0xAB56 - -int karma_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c deleted file mode 100644 index bd8ae788f68..00000000000 --- a/fs/partitions/ldm.c +++ /dev/null @@ -1,1570 +0,0 @@ -/** - * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) - * - * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> - * - * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads - * - * This program is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation; either version 2 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program (in the main directory of the source in the file COPYING); if - * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, - * Boston, MA 02111-1307 USA - */ - -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/stringify.h> -#include <linux/kernel.h> -#include "ldm.h" -#include "check.h" -#include "msdos.h" - -/** - * ldm_debug/info/error/crit - Output an error message - * @f: A printf format string containing the message - * @...: Variables to substitute into @f - * - * ldm_debug() writes a DEBUG level message to the syslog but only if the - * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. - */ -#ifndef CONFIG_LDM_DEBUG -#define ldm_debug(...) do {} while (0) -#else -#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) -#endif - -#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) -#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) -#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) - -static __printf(3, 4) -void _ldm_printk(const char *level, const char *function, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start (args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk("%s%s(): %pV\n", level, function, &vaf); - - va_end(args); -} - -/** - * ldm_parse_hexbyte - Convert a ASCII hex number to a byte - * @src: Pointer to at least 2 characters to convert. - * - * Convert a two character ASCII hex string to a number. - * - * Return: 0-255 Success, the byte was parsed correctly - * -1 Error, an invalid character was supplied - */ -static int ldm_parse_hexbyte (const u8 *src) -{ - unsigned int x; /* For correct wrapping */ - int h; - - /* high part */ - x = h = hex_to_bin(src[0]); - if (h < 0) - return -1; - - /* low part */ - h = hex_to_bin(src[1]); - if (h < 0) - return -1; - - return (x << 4) + h; -} - -/** - * ldm_parse_guid - Convert GUID from ASCII to binary - * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba - * @dest: Memory block to hold binary GUID (16 bytes) - * - * N.B. The GUID need not be NULL terminated. - * - * Return: 'true' @dest contains binary GUID - * 'false' @dest contents are undefined - */ -static bool ldm_parse_guid (const u8 *src, u8 *dest) -{ - static const int size[] = { 4, 2, 2, 2, 6 }; - int i, j, v; - - if (src[8] != '-' || src[13] != '-' || - src[18] != '-' || src[23] != '-') - return false; - - for (j = 0; j < 5; j++, src++) - for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) - if ((v = ldm_parse_hexbyte (src)) < 0) - return false; - - return true; -} - -/** - * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure - * @data: Raw database PRIVHEAD structure loaded from the device - * @ph: In-memory privhead structure in which to return parsed information - * - * This parses the LDM database PRIVHEAD structure supplied in @data and - * sets up the in-memory privhead structure @ph with the obtained information. - * - * Return: 'true' @ph contains the PRIVHEAD data - * 'false' @ph contents are undefined - */ -static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) -{ - bool is_vista = false; - - BUG_ON(!data || !ph); - if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { - ldm_error("Cannot find PRIVHEAD structure. LDM database is" - " corrupt. Aborting."); - return false; - } - ph->ver_major = get_unaligned_be16(data + 0x000C); - ph->ver_minor = get_unaligned_be16(data + 0x000E); - ph->logical_disk_start = get_unaligned_be64(data + 0x011B); - ph->logical_disk_size = get_unaligned_be64(data + 0x0123); - ph->config_start = get_unaligned_be64(data + 0x012B); - ph->config_size = get_unaligned_be64(data + 0x0133); - /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ - if (ph->ver_major == 2 && ph->ver_minor == 12) - is_vista = true; - if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { - ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." - " Aborting.", ph->ver_major, ph->ver_minor); - return false; - } - ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, - ph->ver_minor, is_vista ? "Vista" : "2000/XP"); - if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ - /* Warn the user and continue, carefully. */ - ldm_info("Database is normally %u bytes, it claims to " - "be %llu bytes.", LDM_DB_SIZE, - (unsigned long long)ph->config_size); - } - if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + - ph->logical_disk_size > ph->config_start)) { - ldm_error("PRIVHEAD disk size doesn't match real disk size"); - return false; - } - if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { - ldm_error("PRIVHEAD contains an invalid GUID."); - return false; - } - ldm_debug("Parsed PRIVHEAD successfully."); - return true; -} - -/** - * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure - * @data: Raw database TOCBLOCK structure loaded from the device - * @toc: In-memory toc structure in which to return parsed information - * - * This parses the LDM Database TOCBLOCK (table of contents) structure supplied - * in @data and sets up the in-memory tocblock structure @toc with the obtained - * information. - * - * N.B. The *_start and *_size values returned in @toc are not range-checked. - * - * Return: 'true' @toc contains the TOCBLOCK data - * 'false' @toc contents are undefined - */ -static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) -{ - BUG_ON (!data || !toc); - - if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { - ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); - return false; - } - strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); - toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; - toc->bitmap1_start = get_unaligned_be64(data + 0x2E); - toc->bitmap1_size = get_unaligned_be64(data + 0x36); - - if (strncmp (toc->bitmap1_name, TOC_BITMAP1, - sizeof (toc->bitmap1_name)) != 0) { - ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", - TOC_BITMAP1, toc->bitmap1_name); - return false; - } - strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); - toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; - toc->bitmap2_start = get_unaligned_be64(data + 0x50); - toc->bitmap2_size = get_unaligned_be64(data + 0x58); - if (strncmp (toc->bitmap2_name, TOC_BITMAP2, - sizeof (toc->bitmap2_name)) != 0) { - ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", - TOC_BITMAP2, toc->bitmap2_name); - return false; - } - ldm_debug ("Parsed TOCBLOCK successfully."); - return true; -} - -/** - * ldm_parse_vmdb - Read the LDM Database VMDB structure - * @data: Raw database VMDB structure loaded from the device - * @vm: In-memory vmdb structure in which to return parsed information - * - * This parses the LDM Database VMDB structure supplied in @data and sets up - * the in-memory vmdb structure @vm with the obtained information. - * - * N.B. The *_start, *_size and *_seq values will be range-checked later. - * - * Return: 'true' @vm contains VMDB info - * 'false' @vm contents are undefined - */ -static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) -{ - BUG_ON (!data || !vm); - - if (MAGIC_VMDB != get_unaligned_be32(data)) { - ldm_crit ("Cannot find the VMDB, database may be corrupt."); - return false; - } - - vm->ver_major = get_unaligned_be16(data + 0x12); - vm->ver_minor = get_unaligned_be16(data + 0x14); - if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { - ldm_error ("Expected VMDB version %d.%d, got %d.%d. " - "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); - return false; - } - - vm->vblk_size = get_unaligned_be32(data + 0x08); - if (vm->vblk_size == 0) { - ldm_error ("Illegal VBLK size"); - return false; - } - - vm->vblk_offset = get_unaligned_be32(data + 0x0C); - vm->last_vblk_seq = get_unaligned_be32(data + 0x04); - - ldm_debug ("Parsed VMDB successfully."); - return true; -} - -/** - * ldm_compare_privheads - Compare two privhead objects - * @ph1: First privhead - * @ph2: Second privhead - * - * This compares the two privhead structures @ph1 and @ph2. - * - * Return: 'true' Identical - * 'false' Different - */ -static bool ldm_compare_privheads (const struct privhead *ph1, - const struct privhead *ph2) -{ - BUG_ON (!ph1 || !ph2); - - return ((ph1->ver_major == ph2->ver_major) && - (ph1->ver_minor == ph2->ver_minor) && - (ph1->logical_disk_start == ph2->logical_disk_start) && - (ph1->logical_disk_size == ph2->logical_disk_size) && - (ph1->config_start == ph2->config_start) && - (ph1->config_size == ph2->config_size) && - !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); -} - -/** - * ldm_compare_tocblocks - Compare two tocblock objects - * @toc1: First toc - * @toc2: Second toc - * - * This compares the two tocblock structures @toc1 and @toc2. - * - * Return: 'true' Identical - * 'false' Different - */ -static bool ldm_compare_tocblocks (const struct tocblock *toc1, - const struct tocblock *toc2) -{ - BUG_ON (!toc1 || !toc2); - - return ((toc1->bitmap1_start == toc2->bitmap1_start) && - (toc1->bitmap1_size == toc2->bitmap1_size) && - (toc1->bitmap2_start == toc2->bitmap2_start) && - (toc1->bitmap2_size == toc2->bitmap2_size) && - !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, - sizeof (toc1->bitmap1_name)) && - !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, - sizeof (toc1->bitmap2_name))); -} - -/** - * ldm_validate_privheads - Compare the primary privhead with its backups - * @state: Partition check state including device holding the LDM Database - * @ph1: Memory struct to fill with ph contents - * - * Read and compare all three privheads from disk. - * - * The privheads on disk show the size and location of the main disk area and - * the configuration area (the database). The values are range-checked against - * @hd, which contains the real size of the disk. - * - * Return: 'true' Success - * 'false' Error - */ -static bool ldm_validate_privheads(struct parsed_partitions *state, - struct privhead *ph1) -{ - static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; - struct privhead *ph[3] = { ph1 }; - Sector sect; - u8 *data; - bool result = false; - long num_sects; - int i; - - BUG_ON (!state || !ph1); - - ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); - ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); - if (!ph[1] || !ph[2]) { - ldm_crit ("Out of memory."); - goto out; - } - - /* off[1 & 2] are relative to ph[0]->config_start */ - ph[0]->config_start = 0; - - /* Read and parse privheads */ - for (i = 0; i < 3; i++) { - data = read_part_sector(state, ph[0]->config_start + off[i], - §); - if (!data) { - ldm_crit ("Disk read failed."); - goto out; - } - result = ldm_parse_privhead (data, ph[i]); - put_dev_sector (sect); - if (!result) { - ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ - if (i < 2) - goto out; /* Already logged */ - else - break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ - } - } - - num_sects = state->bdev->bd_inode->i_size >> 9; - - if ((ph[0]->config_start > num_sects) || - ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { - ldm_crit ("Database extends beyond the end of the disk."); - goto out; - } - - if ((ph[0]->logical_disk_start > ph[0]->config_start) || - ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) - > ph[0]->config_start)) { - ldm_crit ("Disk and database overlap."); - goto out; - } - - if (!ldm_compare_privheads (ph[0], ph[1])) { - ldm_crit ("Primary and backup PRIVHEADs don't match."); - goto out; - } - /* FIXME ignore this for now - if (!ldm_compare_privheads (ph[0], ph[2])) { - ldm_crit ("Primary and backup PRIVHEADs don't match."); - goto out; - }*/ - ldm_debug ("Validated PRIVHEADs successfully."); - result = true; -out: - kfree (ph[1]); - kfree (ph[2]); - return result; -} - -/** - * ldm_validate_tocblocks - Validate the table of contents and its backups - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database - * @ldb: Cache of the database structures - * - * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. - * - * The offsets and sizes of the configs are range-checked against a privhead. - * - * Return: 'true' @toc1 contains validated TOCBLOCK info - * 'false' @toc1 contents are undefined - */ -static bool ldm_validate_tocblocks(struct parsed_partitions *state, - unsigned long base, struct ldmdb *ldb) -{ - static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; - struct tocblock *tb[4]; - struct privhead *ph; - Sector sect; - u8 *data; - int i, nr_tbs; - bool result = false; - - BUG_ON(!state || !ldb); - ph = &ldb->ph; - tb[0] = &ldb->toc; - tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); - if (!tb[1]) { - ldm_crit("Out of memory."); - goto err; - } - tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); - tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); - /* - * Try to read and parse all four TOCBLOCKs. - * - * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so - * skip any that fail as long as we get at least one valid TOCBLOCK. - */ - for (nr_tbs = i = 0; i < 4; i++) { - data = read_part_sector(state, base + off[i], §); - if (!data) { - ldm_error("Disk read failed for TOCBLOCK %d.", i); - continue; - } - if (ldm_parse_tocblock(data, tb[nr_tbs])) - nr_tbs++; - put_dev_sector(sect); - } - if (!nr_tbs) { - ldm_crit("Failed to find a valid TOCBLOCK."); - goto err; - } - /* Range check the TOCBLOCK against a privhead. */ - if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || - ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > - ph->config_size)) { - ldm_crit("The bitmaps are out of range. Giving up."); - goto err; - } - /* Compare all loaded TOCBLOCKs. */ - for (i = 1; i < nr_tbs; i++) { - if (!ldm_compare_tocblocks(tb[0], tb[i])) { - ldm_crit("TOCBLOCKs 0 and %d do not match.", i); - goto err; - } - } - ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); - result = true; -err: - kfree(tb[1]); - return result; -} - -/** - * ldm_validate_vmdb - Read the VMDB and validate it - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @bdev, of the database - * @ldb: Cache of the database structures - * - * Find the vmdb of the LDM Database stored on @bdev and return the parsed - * information in @ldb. - * - * Return: 'true' @ldb contains validated VBDB info - * 'false' @ldb contents are undefined - */ -static bool ldm_validate_vmdb(struct parsed_partitions *state, - unsigned long base, struct ldmdb *ldb) -{ - Sector sect; - u8 *data; - bool result = false; - struct vmdb *vm; - struct tocblock *toc; - - BUG_ON (!state || !ldb); - - vm = &ldb->vm; - toc = &ldb->toc; - - data = read_part_sector(state, base + OFF_VMDB, §); - if (!data) { - ldm_crit ("Disk read failed."); - return false; - } - - if (!ldm_parse_vmdb (data, vm)) - goto out; /* Already logged */ - - /* Are there uncommitted transactions? */ - if (get_unaligned_be16(data + 0x10) != 0x01) { - ldm_crit ("Database is not in a consistent state. Aborting."); - goto out; - } - - if (vm->vblk_offset != 512) - ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); - - /* - * The last_vblkd_seq can be before the end of the vmdb, just make sure - * it is not out of bounds. - */ - if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { - ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " - "Database is corrupt. Aborting."); - goto out; - } - - result = true; -out: - put_dev_sector (sect); - return result; -} - - -/** - * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @state: Partition check state including device holding the LDM Database - * - * This function provides a weak test to decide whether the device is a dynamic - * disk or not. It looks for an MS-DOS-style partition table containing at - * least one partition of type 0x42 (formerly SFS, now used by Windows for - * dynamic disks). - * - * N.B. The only possible error can come from the read_part_sector and that is - * only likely to happen if the underlying device is strange. If that IS - * the case we should return zero to let someone else try. - * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred - */ -static bool ldm_validate_partition_table(struct parsed_partitions *state) -{ - Sector sect; - u8 *data; - struct partition *p; - int i; - bool result = false; - - BUG_ON(!state); - - data = read_part_sector(state, 0, §); - if (!data) { - ldm_info ("Disk read failed."); - return false; - } - - if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) - goto out; - - p = (struct partition*)(data + 0x01BE); - for (i = 0; i < 4; i++, p++) - if (SYS_IND (p) == LDM_PARTITION) { - result = true; - break; - } - - if (result) - ldm_debug ("Found W2K dynamic disk partition type."); - -out: - put_dev_sector (sect); - return result; -} - -/** - * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id - * @ldb: Cache of the database structures - * - * The LDM Database contains a list of all partitions on all dynamic disks. - * The primary PRIVHEAD, at the beginning of the physical disk, tells us - * the GUID of this disk. This function searches for the GUID in a linked - * list of vblk's. - * - * Return: Pointer, A matching vblk was found - * NULL, No match, or an error - */ -static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) -{ - struct list_head *item; - - BUG_ON (!ldb); - - list_for_each (item, &ldb->v_disk) { - struct vblk *v = list_entry (item, struct vblk, list); - if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) - return v; - } - - return NULL; -} - -/** - * ldm_create_data_partitions - Create data partitions for this device - * @pp: List of the partitions parsed so far - * @ldb: Cache of the database structures - * - * The database contains ALL the partitions for ALL disk groups, so we need to - * filter out this specific disk. Using the disk's object id, we can find all - * the partitions in the database that belong to this disk. - * - * Add each partition in our database, to the parsed_partitions structure. - * - * N.B. This function creates the partitions in the order it finds partition - * objects in the linked list. - * - * Return: 'true' Partition created - * 'false' Error, probably a range checking problem - */ -static bool ldm_create_data_partitions (struct parsed_partitions *pp, - const struct ldmdb *ldb) -{ - struct list_head *item; - struct vblk *vb; - struct vblk *disk; - struct vblk_part *part; - int part_num = 1; - - BUG_ON (!pp || !ldb); - - disk = ldm_get_disk_objid (ldb); - if (!disk) { - ldm_crit ("Can't find the ID of this disk in the database."); - return false; - } - - strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); - - /* Create the data partitions */ - list_for_each (item, &ldb->v_part) { - vb = list_entry (item, struct vblk, list); - part = &vb->vblk.part; - - if (part->disk_id != disk->obj_id) - continue; - - put_partition (pp, part_num, ldb->ph.logical_disk_start + - part->start, part->size); - part_num++; - } - - strlcat(pp->pp_buf, "\n", PAGE_SIZE); - return true; -} - - -/** - * ldm_relative - Calculate the next relative offset - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @base: Size of the previous fixed width fields - * @offset: Cumulative size of the previous variable-width fields - * - * Because many of the VBLK fields are variable-width, it's necessary - * to calculate each offset based on the previous one and the length - * of the field it pointed to. - * - * Return: -1 Error, the calculated offset exceeded the size of the buffer - * n OK, a range-checked offset into buffer - */ -static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) -{ - - base += offset; - if (!buffer || offset < 0 || base > buflen) { - if (!buffer) - ldm_error("!buffer"); - if (offset < 0) - ldm_error("offset (%d) < 0", offset); - if (base > buflen) - ldm_error("base (%d) > buflen (%d)", base, buflen); - return -1; - } - if (base + buffer[base] >= buflen) { - ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, - buffer[base], buflen); - return -1; - } - return buffer[base] + offset + 1; -} - -/** - * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order - * @block: Pointer to the variable-width number to convert - * - * Large numbers in the LDM Database are often stored in a packed format. Each - * number is prefixed by a one byte width marker. All numbers in the database - * are stored in big-endian byte order. This function reads one of these - * numbers and returns the result - * - * N.B. This function DOES NOT perform any range checking, though the most - * it will read is eight bytes. - * - * Return: n A number - * 0 Zero, or an error occurred - */ -static u64 ldm_get_vnum (const u8 *block) -{ - u64 tmp = 0; - u8 length; - - BUG_ON (!block); - - length = *block++; - - if (length && length <= 8) - while (length--) - tmp = (tmp << 8) | *block++; - else - ldm_error ("Illegal length %d.", length); - - return tmp; -} - -/** - * ldm_get_vstr - Read a length-prefixed string into a buffer - * @block: Pointer to the length marker - * @buffer: Location to copy string to - * @buflen: Size of the output buffer - * - * Many of the strings in the LDM Database are not NULL terminated. Instead - * they are prefixed by a one byte length marker. This function copies one of - * these strings into a buffer. - * - * N.B. This function DOES NOT perform any range checking on the input. - * If the buffer is too small, the output will be truncated. - * - * Return: 0, Error and @buffer contents are undefined - * n, String length in characters (excluding NULL) - * buflen-1, String was truncated. - */ -static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) -{ - int length; - - BUG_ON (!block || !buffer); - - length = block[0]; - if (length >= buflen) { - ldm_error ("Truncating string %d -> %d.", length, buflen); - length = buflen - 1; - } - memcpy (buffer, block + 1, length); - buffer[length] = 0; - return length; -} - - -/** - * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Component object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Component VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; - struct vblk_comp *comp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); - r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); - r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); - - if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { - r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); - r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); - len = r_cols; - } else { - r_stripe = 0; - r_cols = 0; - len = r_parent; - } - if (len < 0) - return false; - - len += VBLK_SIZE_CMP3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - comp = &vb->vblk.comp; - ldm_get_vstr (buffer + 0x18 + r_name, comp->state, - sizeof (comp->state)); - comp->type = buffer[0x18 + r_vstate]; - comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); - comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); - comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; - - return true; -} - -/** - * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk Group object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Disk Group VBLK - * 'false' @vb contents are not defined - */ -static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_diskid, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); - - if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { - r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); - r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); - len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; - len = r_diskid; - } - if (len < 0) - return false; - - len += VBLK_SIZE_DGR3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - dgrp = &vb->vblk.dgrp; - ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, - sizeof (dgrp->disk_id)); - return true; -} - -/** - * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk Group object (version 4) into a vblk structure. - * - * Return: 'true' @vb contains a Disk Group VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) -{ - char buf[64]; - int r_objid, r_name, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - - if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { - r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); - r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); - len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; - len = r_name; - } - if (len < 0) - return false; - - len += VBLK_SIZE_DGR4; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - dgrp = &vb->vblk.dgrp; - - ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); - return true; -} - -/** - * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Disk VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_diskid, r_altname, len; - struct vblk_disk *disk; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); - r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); - len = r_altname; - if (len < 0) - return false; - - len += VBLK_SIZE_DSK3; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - disk = &vb->vblk.disk; - ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, - sizeof (disk->alt_name)); - if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) - return false; - - return true; -} - -/** - * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Disk object (version 4) into a vblk structure. - * - * Return: 'true' @vb contains a Disk VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, len; - struct vblk_disk *disk; - - BUG_ON (!buffer || !vb); - - r_objid = ldm_relative (buffer, buflen, 0x18, 0); - r_name = ldm_relative (buffer, buflen, 0x18, r_objid); - len = r_name; - if (len < 0) - return false; - - len += VBLK_SIZE_DSK4; - if (len != get_unaligned_be32(buffer + 0x14)) - return false; - - disk = &vb->vblk.disk; - memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); - return true; -} - -/** - * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Partition object (version 3) into a vblk structure. - * - * Return: 'true' @vb contains a Partition VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; - struct vblk_part *part; - - BUG_ON(!buffer || !vb); - r_objid = ldm_relative(buffer, buflen, 0x18, 0); - if (r_objid < 0) { - ldm_error("r_objid %d < 0", r_objid); - return false; - } - r_name = ldm_relative(buffer, buflen, 0x18, r_objid); - if (r_name < 0) { - ldm_error("r_name %d < 0", r_name); - return false; - } - r_size = ldm_relative(buffer, buflen, 0x34, r_name); - if (r_size < 0) { - ldm_error("r_size %d < 0", r_size); - return false; - } - r_parent = ldm_relative(buffer, buflen, 0x34, r_size); - if (r_parent < 0) { - ldm_error("r_parent %d < 0", r_parent); - return false; - } - r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); - if (r_diskid < 0) { - ldm_error("r_diskid %d < 0", r_diskid); - return false; - } - if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { - r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); - if (r_index < 0) { - ldm_error("r_index %d < 0", r_index); - return false; - } - len = r_index; - } else { - r_index = 0; - len = r_diskid; - } - if (len < 0) { - ldm_error("len %d < 0", len); - return false; - } - len += VBLK_SIZE_PRT3; - if (len > get_unaligned_be32(buffer + 0x14)) { - ldm_error("len %d > BE32(buffer + 0x14) %d", len, - get_unaligned_be32(buffer + 0x14)); - return false; - } - part = &vb->vblk.part; - part->start = get_unaligned_be64(buffer + 0x24 + r_name); - part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); - part->size = ldm_get_vnum(buffer + 0x34 + r_name); - part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); - part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); - if (vb->flags & VBLK_FLAG_PART_INDEX) - part->partnum = buffer[0x35 + r_diskid]; - else - part->partnum = 0; - return true; -} - -/** - * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure - * @buffer: Block of data being worked on - * @buflen: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK Volume object (version 5) into a vblk structure. - * - * Return: 'true' @vb contains a Volume VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) -{ - int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; - int r_id1, r_id2, r_size2, r_drive, len; - struct vblk_volu *volu; - - BUG_ON(!buffer || !vb); - r_objid = ldm_relative(buffer, buflen, 0x18, 0); - if (r_objid < 0) { - ldm_error("r_objid %d < 0", r_objid); - return false; - } - r_name = ldm_relative(buffer, buflen, 0x18, r_objid); - if (r_name < 0) { - ldm_error("r_name %d < 0", r_name); - return false; - } - r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); - if (r_vtype < 0) { - ldm_error("r_vtype %d < 0", r_vtype); - return false; - } - r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); - if (r_disable_drive_letter < 0) { - ldm_error("r_disable_drive_letter %d < 0", - r_disable_drive_letter); - return false; - } - r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); - if (r_child < 0) { - ldm_error("r_child %d < 0", r_child); - return false; - } - r_size = ldm_relative(buffer, buflen, 0x3D, r_child); - if (r_size < 0) { - ldm_error("r_size %d < 0", r_size); - return false; - } - if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { - r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); - if (r_id1 < 0) { - ldm_error("r_id1 %d < 0", r_id1); - return false; - } - } else - r_id1 = r_size; - if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { - r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); - if (r_id2 < 0) { - ldm_error("r_id2 %d < 0", r_id2); - return false; - } - } else - r_id2 = r_id1; - if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { - r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); - if (r_size2 < 0) { - ldm_error("r_size2 %d < 0", r_size2); - return false; - } - } else - r_size2 = r_id2; - if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { - r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); - if (r_drive < 0) { - ldm_error("r_drive %d < 0", r_drive); - return false; - } - } else - r_drive = r_size2; - len = r_drive; - if (len < 0) { - ldm_error("len %d < 0", len); - return false; - } - len += VBLK_SIZE_VOL5; - if (len > get_unaligned_be32(buffer + 0x14)) { - ldm_error("len %d > BE32(buffer + 0x14) %d", len, - get_unaligned_be32(buffer + 0x14)); - return false; - } - volu = &vb->vblk.volu; - ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, - sizeof(volu->volume_type)); - memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, - sizeof(volu->volume_state)); - volu->size = ldm_get_vnum(buffer + 0x3D + r_child); - volu->partition_type = buffer[0x41 + r_size]; - memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); - if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { - ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, - sizeof(volu->drive_hint)); - } - return true; -} - -/** - * ldm_parse_vblk - Read a raw VBLK object into a vblk structure - * @buf: Block of data being worked on - * @len: Size of the block of data - * @vb: In-memory vblk in which to return information - * - * Read a raw VBLK object into a vblk structure. This function just reads the - * information common to all VBLK types, then delegates the rest of the work to - * helper functions: ldm_parse_*. - * - * Return: 'true' @vb contains a VBLK - * 'false' @vb contents are not defined - */ -static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) -{ - bool result = false; - int r_objid; - - BUG_ON (!buf || !vb); - - r_objid = ldm_relative (buf, len, 0x18, 0); - if (r_objid < 0) { - ldm_error ("VBLK header is corrupt."); - return false; - } - - vb->flags = buf[0x12]; - vb->type = buf[0x13]; - vb->obj_id = ldm_get_vnum (buf + 0x18); - ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); - - switch (vb->type) { - case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; - case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; - case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; - case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; - case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; - case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; - case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; - } - - if (result) - ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", - (unsigned long long) vb->obj_id, vb->type); - else - ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", - (unsigned long long) vb->obj_id, vb->type); - - return result; -} - - -/** - * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database - * @data: Raw VBLK to add to the database - * @len: Size of the raw VBLK - * @ldb: Cache of the database structures - * - * The VBLKs are sorted into categories. Partitions are also sorted by offset. - * - * N.B. This function does not check the validity of the VBLKs. - * - * Return: 'true' The VBLK was added - * 'false' An error occurred - */ -static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) -{ - struct vblk *vb; - struct list_head *item; - - BUG_ON (!data || !ldb); - - vb = kmalloc (sizeof (*vb), GFP_KERNEL); - if (!vb) { - ldm_crit ("Out of memory."); - return false; - } - - if (!ldm_parse_vblk (data, len, vb)) { - kfree(vb); - return false; /* Already logged */ - } - - /* Put vblk into the correct list. */ - switch (vb->type) { - case VBLK_DGR3: - case VBLK_DGR4: - list_add (&vb->list, &ldb->v_dgrp); - break; - case VBLK_DSK3: - case VBLK_DSK4: - list_add (&vb->list, &ldb->v_disk); - break; - case VBLK_VOL5: - list_add (&vb->list, &ldb->v_volu); - break; - case VBLK_CMP3: - list_add (&vb->list, &ldb->v_comp); - break; - case VBLK_PRT3: - /* Sort by the partition's start sector. */ - list_for_each (item, &ldb->v_part) { - struct vblk *v = list_entry (item, struct vblk, list); - if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && - (v->vblk.part.start > vb->vblk.part.start)) { - list_add_tail (&vb->list, &v->list); - return true; - } - } - list_add_tail (&vb->list, &ldb->v_part); - break; - } - return true; -} - -/** - * ldm_frag_add - Add a VBLK fragment to a list - * @data: Raw fragment to be added to the list - * @size: Size of the raw fragment - * @frags: Linked list of VBLK fragments - * - * Fragmented VBLKs may not be consecutive in the database, so they are placed - * in a list so they can be pieced together later. - * - * Return: 'true' Success, the VBLK was added to the list - * 'false' Error, a problem occurred - */ -static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) -{ - struct frag *f; - struct list_head *item; - int rec, num, group; - - BUG_ON (!data || !frags); - - if (size < 2 * VBLK_SIZE_HEAD) { - ldm_error("Value of size is to small."); - return false; - } - - group = get_unaligned_be32(data + 0x08); - rec = get_unaligned_be16(data + 0x0C); - num = get_unaligned_be16(data + 0x0E); - if ((num < 1) || (num > 4)) { - ldm_error ("A VBLK claims to have %d parts.", num); - return false; - } - if (rec >= num) { - ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); - return false; - } - - list_for_each (item, frags) { - f = list_entry (item, struct frag, list); - if (f->group == group) - goto found; - } - - f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); - if (!f) { - ldm_crit ("Out of memory."); - return false; - } - - f->group = group; - f->num = num; - f->rec = rec; - f->map = 0xFF << num; - - list_add_tail (&f->list, frags); -found: - if (rec >= f->num) { - ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); - return false; - } - - if (f->map & (1 << rec)) { - ldm_error ("Duplicate VBLK, part %d.", rec); - f->map &= 0x7F; /* Mark the group as broken */ - return false; - } - - f->map |= (1 << rec); - - data += VBLK_SIZE_HEAD; - size -= VBLK_SIZE_HEAD; - - memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); - - return true; -} - -/** - * ldm_frag_free - Free a linked list of VBLK fragments - * @list: Linked list of fragments - * - * Free a linked list of VBLK fragments - * - * Return: none - */ -static void ldm_frag_free (struct list_head *list) -{ - struct list_head *item, *tmp; - - BUG_ON (!list); - - list_for_each_safe (item, tmp, list) - kfree (list_entry (item, struct frag, list)); -} - -/** - * ldm_frag_commit - Validate fragmented VBLKs and add them to the database - * @frags: Linked list of VBLK fragments - * @ldb: Cache of the database structures - * - * Now that all the fragmented VBLKs have been collected, they must be added to - * the database for later use. - * - * Return: 'true' All the fragments we added successfully - * 'false' One or more of the fragments we invalid - */ -static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) -{ - struct frag *f; - struct list_head *item; - - BUG_ON (!frags || !ldb); - - list_for_each (item, frags) { - f = list_entry (item, struct frag, list); - - if (f->map != 0xFF) { - ldm_error ("VBLK group %d is incomplete (0x%02x).", - f->group, f->map); - return false; - } - - if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) - return false; /* Already logged */ - } - return true; -} - -/** - * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database - * @ldb: Cache of the database structures - * - * To use the information from the VBLKs, they need to be read from the disk, - * unpacked and validated. We cache them in @ldb according to their type. - * - * Return: 'true' All the VBLKs were read successfully - * 'false' An error occurred - */ -static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, - struct ldmdb *ldb) -{ - int size, perbuf, skip, finish, s, v, recs; - u8 *data = NULL; - Sector sect; - bool result = false; - LIST_HEAD (frags); - - BUG_ON(!state || !ldb); - - size = ldb->vm.vblk_size; - perbuf = 512 / size; - skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ - finish = (size * ldb->vm.last_vblk_seq) >> 9; - - for (s = skip; s < finish; s++) { /* For each sector */ - data = read_part_sector(state, base + OFF_VMDB + s, §); - if (!data) { - ldm_crit ("Disk read failed."); - goto out; - } - - for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ - if (MAGIC_VBLK != get_unaligned_be32(data)) { - ldm_error ("Expected to find a VBLK."); - goto out; - } - - recs = get_unaligned_be16(data + 0x0E); /* Number of records */ - if (recs == 1) { - if (!ldm_ldmdb_add (data, size, ldb)) - goto out; /* Already logged */ - } else if (recs > 1) { - if (!ldm_frag_add (data, size, &frags)) - goto out; /* Already logged */ - } - /* else Record is not in use, ignore it. */ - } - put_dev_sector (sect); - data = NULL; - } - - result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ -out: - if (data) - put_dev_sector (sect); - ldm_frag_free (&frags); - - return result; -} - -/** - * ldm_free_vblks - Free a linked list of vblk's - * @lh: Head of a linked list of struct vblk - * - * Free a list of vblk's and free the memory used to maintain the list. - * - * Return: none - */ -static void ldm_free_vblks (struct list_head *lh) -{ - struct list_head *item, *tmp; - - BUG_ON (!lh); - - list_for_each_safe (item, tmp, lh) - kfree (list_entry (item, struct vblk, list)); -} - - -/** - * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @state: Partition check state including device holding the LDM Database - * - * This determines whether the device @bdev is a dynamic disk and if so creates - * the partitions necessary in the gendisk structure pointed to by @hd. - * - * We create a dummy device 1, which contains the LDM database, and then create - * each partition described by the LDM database in sequence as devices 2+. For - * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, - * and so on: the actual data containing partitions. - * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk - * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted - */ -int ldm_partition(struct parsed_partitions *state) -{ - struct ldmdb *ldb; - unsigned long base; - int result = -1; - - BUG_ON(!state); - - /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table(state)) - return 0; - - ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); - if (!ldb) { - ldm_crit ("Out of memory."); - goto out; - } - - /* Parse and check privheads. */ - if (!ldm_validate_privheads(state, &ldb->ph)) - goto out; /* Already logged */ - - /* All further references are relative to base (database start). */ - base = ldb->ph.config_start; - - /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks(state, base, ldb) || - !ldm_validate_vmdb(state, base, ldb)) - goto out; /* Already logged */ - - /* Initialize vblk lists in ldmdb struct */ - INIT_LIST_HEAD (&ldb->v_dgrp); - INIT_LIST_HEAD (&ldb->v_disk); - INIT_LIST_HEAD (&ldb->v_volu); - INIT_LIST_HEAD (&ldb->v_comp); - INIT_LIST_HEAD (&ldb->v_part); - - if (!ldm_get_vblks(state, base, ldb)) { - ldm_crit ("Failed to read the VBLKs from the database."); - goto cleanup; - } - - /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions(state, ldb)) { - ldm_debug ("Parsed LDM database successfully."); - result = 1; - } - /* else Already logged */ - -cleanup: - ldm_free_vblks (&ldb->v_dgrp); - ldm_free_vblks (&ldb->v_disk); - ldm_free_vblks (&ldb->v_volu); - ldm_free_vblks (&ldb->v_comp); - ldm_free_vblks (&ldb->v_part); -out: - kfree (ldb); - return result; -} diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h deleted file mode 100644 index 374242c0971..00000000000 --- a/fs/partitions/ldm.h +++ /dev/null @@ -1,215 +0,0 @@ -/** - * ldm - Part of the Linux-NTFS project. - * - * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> - * Copyright (c) 2001-2007 Anton Altaparmakov - * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> - * - * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program (in the main directory of the Linux-NTFS source - * in the file COPYING); if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _FS_PT_LDM_H_ -#define _FS_PT_LDM_H_ - -#include <linux/types.h> -#include <linux/list.h> -#include <linux/genhd.h> -#include <linux/fs.h> -#include <asm/unaligned.h> -#include <asm/byteorder.h> - -struct parsed_partitions; - -/* Magic numbers in CPU format. */ -#define MAGIC_VMDB 0x564D4442 /* VMDB */ -#define MAGIC_VBLK 0x56424C4B /* VBLK */ -#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ -#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ - -/* The defined vblk types. */ -#define VBLK_VOL5 0x51 /* Volume, version 5 */ -#define VBLK_CMP3 0x32 /* Component, version 3 */ -#define VBLK_PRT3 0x33 /* Partition, version 3 */ -#define VBLK_DSK3 0x34 /* Disk, version 3 */ -#define VBLK_DSK4 0x44 /* Disk, version 4 */ -#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ -#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ - -/* vblk flags indicating extra information will be present */ -#define VBLK_FLAG_COMP_STRIPE 0x10 -#define VBLK_FLAG_PART_INDEX 0x08 -#define VBLK_FLAG_DGR3_IDS 0x08 -#define VBLK_FLAG_DGR4_IDS 0x08 -#define VBLK_FLAG_VOLU_ID1 0x08 -#define VBLK_FLAG_VOLU_ID2 0x20 -#define VBLK_FLAG_VOLU_SIZE 0x80 -#define VBLK_FLAG_VOLU_DRIVE 0x02 - -/* size of a vblk's static parts */ -#define VBLK_SIZE_HEAD 16 -#define VBLK_SIZE_CMP3 22 /* Name and version */ -#define VBLK_SIZE_DGR3 12 -#define VBLK_SIZE_DGR4 44 -#define VBLK_SIZE_DSK3 12 -#define VBLK_SIZE_DSK4 45 -#define VBLK_SIZE_PRT3 28 -#define VBLK_SIZE_VOL5 58 - -/* component types */ -#define COMP_STRIPE 0x01 /* Stripe-set */ -#define COMP_BASIC 0x02 /* Basic disk */ -#define COMP_RAID 0x03 /* Raid-set */ - -/* Other constants. */ -#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ - -#define OFF_PRIV1 6 /* Offset of the first privhead - relative to the start of the - device in sectors */ - -/* Offsets to structures within the LDM Database in sectors. */ -#define OFF_PRIV2 1856 /* Backup private headers. */ -#define OFF_PRIV3 2047 - -#define OFF_TOCB1 1 /* Tables of contents. */ -#define OFF_TOCB2 2 -#define OFF_TOCB3 2045 -#define OFF_TOCB4 2046 - -#define OFF_VMDB 17 /* List of partitions. */ - -#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ - -#define TOC_BITMAP1 "config" /* Names of the two defined */ -#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ - -/* Borrowed from msdos.c */ -#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) - -struct frag { /* VBLK Fragment handling */ - struct list_head list; - u32 group; - u8 num; /* Total number of records */ - u8 rec; /* This is record number n */ - u8 map; /* Which portions are in use */ - u8 data[0]; -}; - -/* In memory LDM database structures. */ - -#define GUID_SIZE 16 - -struct privhead { /* Offsets and sizes are in sectors. */ - u16 ver_major; - u16 ver_minor; - u64 logical_disk_start; - u64 logical_disk_size; - u64 config_start; - u64 config_size; - u8 disk_id[GUID_SIZE]; -}; - -struct tocblock { /* We have exactly two bitmaps. */ - u8 bitmap1_name[16]; - u64 bitmap1_start; - u64 bitmap1_size; - u8 bitmap2_name[16]; - u64 bitmap2_start; - u64 bitmap2_size; -}; - -struct vmdb { /* VMDB: The database header */ - u16 ver_major; - u16 ver_minor; - u32 vblk_size; - u32 vblk_offset; - u32 last_vblk_seq; -}; - -struct vblk_comp { /* VBLK Component */ - u8 state[16]; - u64 parent_id; - u8 type; - u8 children; - u16 chunksize; -}; - -struct vblk_dgrp { /* VBLK Disk Group */ - u8 disk_id[64]; -}; - -struct vblk_disk { /* VBLK Disk */ - u8 disk_id[GUID_SIZE]; - u8 alt_name[128]; -}; - -struct vblk_part { /* VBLK Partition */ - u64 start; - u64 size; /* start, size and vol_off in sectors */ - u64 volume_offset; - u64 parent_id; - u64 disk_id; - u8 partnum; -}; - -struct vblk_volu { /* VBLK Volume */ - u8 volume_type[16]; - u8 volume_state[16]; - u8 guid[16]; - u8 drive_hint[4]; - u64 size; - u8 partition_type; -}; - -struct vblk_head { /* VBLK standard header */ - u32 group; - u16 rec; - u16 nrec; -}; - -struct vblk { /* Generalised VBLK */ - u8 name[64]; - u64 obj_id; - u32 sequence; - u8 flags; - u8 type; - union { - struct vblk_comp comp; - struct vblk_dgrp dgrp; - struct vblk_disk disk; - struct vblk_part part; - struct vblk_volu volu; - } vblk; - struct list_head list; -}; - -struct ldmdb { /* Cache of the database */ - struct privhead ph; - struct tocblock toc; - struct vmdb vm; - struct list_head v_dgrp; - struct list_head v_disk; - struct list_head v_volu; - struct list_head v_comp; - struct list_head v_part; -}; - -int ldm_partition(struct parsed_partitions *state); - -#endif /* _FS_PT_LDM_H_ */ - diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c deleted file mode 100644 index 11f688bd76c..00000000000 --- a/fs/partitions/mac.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * fs/partitions/mac.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include <linux/ctype.h> -#include "check.h" -#include "mac.h" - -#ifdef CONFIG_PPC_PMAC -#include <asm/machdep.h> -extern void note_bootable_part(dev_t dev, int part, int goodness); -#endif - -/* - * Code to understand MacOS partition tables. - */ - -static inline void mac_fix_string(char *stg, int len) -{ - int i; - - for (i = len - 1; i >= 0 && stg[i] == ' '; i--) - stg[i] = 0; -} - -int mac_partition(struct parsed_partitions *state) -{ - Sector sect; - unsigned char *data; - int slot, blocks_in_map; - unsigned secsize; -#ifdef CONFIG_PPC_PMAC - int found_root = 0; - int found_root_goodness = 0; -#endif - struct mac_partition *part; - struct mac_driver_desc *md; - - /* Get 0th block and look at the first partition map entry. */ - md = read_part_sector(state, 0, §); - if (!md) - return -1; - if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { - put_dev_sector(sect); - return 0; - } - secsize = be16_to_cpu(md->block_size); - put_dev_sector(sect); - data = read_part_sector(state, secsize/512, §); - if (!data) - return -1; - part = (struct mac_partition *) (data + secsize%512); - if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { - put_dev_sector(sect); - return 0; /* not a MacOS disk */ - } - blocks_in_map = be32_to_cpu(part->map_count); - if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { - put_dev_sector(sect); - return 0; - } - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); - for (slot = 1; slot <= blocks_in_map; ++slot) { - int pos = slot * secsize; - put_dev_sector(sect); - data = read_part_sector(state, pos/512, §); - if (!data) - return -1; - part = (struct mac_partition *) (data + pos%512); - if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) - break; - put_partition(state, slot, - be32_to_cpu(part->start_block) * (secsize/512), - be32_to_cpu(part->block_count) * (secsize/512)); - - if (!strnicmp(part->type, "Linux_RAID", 10)) - state->parts[slot].flags = ADDPART_FLAG_RAID; -#ifdef CONFIG_PPC_PMAC - /* - * If this is the first bootable partition, tell the - * setup code, in case it wants to make this the root. - */ - if (machine_is(powermac)) { - int goodness = 0; - - mac_fix_string(part->processor, 16); - mac_fix_string(part->name, 32); - mac_fix_string(part->type, 32); - - if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) - && strcasecmp(part->processor, "powerpc") == 0) - goodness++; - - if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 - || (strnicmp(part->type, "Linux", 5) == 0 - && strcasecmp(part->type, "Linux_swap") != 0)) { - int i, l; - - goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) - goodness++; - for (i = 0; i <= l - 4; ++i) { - if (strnicmp(part->name + i, "root", - 4) == 0) { - goodness += 2; - break; - } - } - if (strnicmp(part->name, "swap", 4) == 0) - goodness--; - } - - if (goodness > found_root_goodness) { - found_root = slot; - found_root_goodness = goodness; - } - } -#endif /* CONFIG_PPC_PMAC */ - } -#ifdef CONFIG_PPC_PMAC - if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, - found_root_goodness); -#endif - - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; -} diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h deleted file mode 100644 index 3c7d9843638..00000000000 --- a/fs/partitions/mac.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * fs/partitions/mac.h - */ - -#define MAC_PARTITION_MAGIC 0x504d - -/* type field value for A/UX or other Unix partitions */ -#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" - -struct mac_partition { - __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ - __be16 res1; - __be32 map_count; /* # blocks in partition map */ - __be32 start_block; /* absolute starting block # of partition */ - __be32 block_count; /* number of blocks in partition */ - char name[32]; /* partition name */ - char type[32]; /* string type description */ - __be32 data_start; /* rel block # of first data block */ - __be32 data_count; /* number of data blocks */ - __be32 status; /* partition status bits */ - __be32 boot_start; - __be32 boot_size; - __be32 boot_load; - __be32 boot_load2; - __be32 boot_entry; - __be32 boot_entry2; - __be32 boot_cksum; - char processor[16]; /* identifies ISA of boot */ - /* there is more stuff after this that we don't need */ -}; - -#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */ - -#define MAC_DRIVER_MAGIC 0x4552 - -/* Driver descriptor structure, in block 0 */ -struct mac_driver_desc { - __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ - __be16 block_size; - __be32 block_count; - /* ... more stuff */ -}; - -int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c deleted file mode 100644 index 5f79a6677c6..00000000000 --- a/fs/partitions/msdos.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * fs/partitions/msdos.c - * - * Code extracted from drivers/block/genhd.c - * Copyright (C) 1991-1998 Linus Torvalds - * - * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug - * in the early extended-partition checks and added DM partitions - * - * Support for DiskManager v6.0x added by Mark Lord, - * with information provided by OnTrack. This now works for linux fdisk - * and LILO, as well as loadlin and bootln. Note that disks other than - * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). - * - * More flexible handling of extended partitions - aeb, 950831 - * - * Check partition table on IDE disks for common CHS translations - * - * Re-organised Feb 1998 Russell King - */ -#include <linux/msdos_fs.h> - -#include "check.h" -#include "msdos.h" -#include "efi.h" - -/* - * Many architectures don't like unaligned accesses, while - * the nr_sects and start_sect partition table entries are - * at a 2 (mod 4) address. - */ -#include <asm/unaligned.h> - -#define SYS_IND(p) get_unaligned(&p->sys_ind) - -static inline sector_t nr_sects(struct partition *p) -{ - return (sector_t)get_unaligned_le32(&p->nr_sects); -} - -static inline sector_t start_sect(struct partition *p) -{ - return (sector_t)get_unaligned_le32(&p->start_sect); -} - -static inline int is_extended_partition(struct partition *p) -{ - return (SYS_IND(p) == DOS_EXTENDED_PARTITION || - SYS_IND(p) == WIN98_EXTENDED_PARTITION || - SYS_IND(p) == LINUX_EXTENDED_PARTITION); -} - -#define MSDOS_LABEL_MAGIC1 0x55 -#define MSDOS_LABEL_MAGIC2 0xAA - -static inline int -msdos_magic_present(unsigned char *p) -{ - return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); -} - -/* Value is EBCDIC 'IBMA' */ -#define AIX_LABEL_MAGIC1 0xC9 -#define AIX_LABEL_MAGIC2 0xC2 -#define AIX_LABEL_MAGIC3 0xD4 -#define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) -{ - struct partition *pt = (struct partition *) (p + 0x1be); - Sector sect; - unsigned char *d; - int slot, ret = 0; - - if (!(p[0] == AIX_LABEL_MAGIC1 && - p[1] == AIX_LABEL_MAGIC2 && - p[2] == AIX_LABEL_MAGIC3 && - p[3] == AIX_LABEL_MAGIC4)) - return 0; - /* Assume the partition table is valid if Linux partitions exists */ - for (slot = 1; slot <= 4; slot++, pt++) { - if (pt->sys_ind == LINUX_SWAP_PARTITION || - pt->sys_ind == LINUX_RAID_PARTITION || - pt->sys_ind == LINUX_DATA_PARTITION || - pt->sys_ind == LINUX_LVM_PARTITION || - is_extended_partition(pt)) - return 0; - } - d = read_part_sector(state, 7, §); - if (d) { - if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') - ret = 1; - put_dev_sector(sect); - }; - return ret; -} - -/* - * Create devices for each logical partition in an extended partition. - * The logical partitions form a linked list, with each entry being - * a partition table with two entries. The first entry - * is the real data partition (with a start relative to the partition - * table start). The second is a pointer to the next logical partition - * (with a start relative to the entire extended partition). - * We do not create a Linux partition for the partition tables, but - * only for the actual data partitions. - */ - -static void parse_extended(struct parsed_partitions *state, - sector_t first_sector, sector_t first_size) -{ - struct partition *p; - Sector sect; - unsigned char *data; - sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; - int loopct = 0; /* number of links followed - without finding a data partition */ - int i; - - this_sector = first_sector; - this_size = first_size; - - while (1) { - if (++loopct > 100) - return; - if (state->next == state->limit) - return; - data = read_part_sector(state, this_sector, §); - if (!data) - return; - - if (!msdos_magic_present(data + 510)) - goto done; - - p = (struct partition *) (data + 0x1be); - - /* - * Usually, the first entry is the real data partition, - * the 2nd entry is the next extended partition, or empty, - * and the 3rd and 4th entries are unused. - * However, DRDOS sometimes has the extended partition as - * the first entry (when the data partition is empty), - * and OS/2 seems to use all four entries. - */ - - /* - * First process the data partition(s) - */ - for (i=0; i<4; i++, p++) { - sector_t offs, size, next; - if (!nr_sects(p) || is_extended_partition(p)) - continue; - - /* Check the 3rd and 4th entries - - these sometimes contain random garbage */ - offs = start_sect(p)*sector_size; - size = nr_sects(p)*sector_size; - next = this_sector + offs; - if (i >= 2) { - if (offs + size > this_size) - continue; - if (next < first_sector) - continue; - if (next + size > first_sector + first_size) - continue; - } - - put_partition(state, state->next, next, size); - if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[state->next].flags = ADDPART_FLAG_RAID; - loopct = 0; - if (++state->next == state->limit) - goto done; - } - /* - * Next, process the (first) extended partition, if present. - * (So far, there seems to be no reason to make - * parse_extended() recursive and allow a tree - * of extended partitions.) - * It should be a link to the next logical partition. - */ - p -= 4; - for (i=0; i<4; i++, p++) - if (nr_sects(p) && is_extended_partition(p)) - break; - if (i == 4) - goto done; /* nothing left to do */ - - this_sector = first_sector + start_sect(p) * sector_size; - this_size = nr_sects(p) * sector_size; - put_dev_sector(sect); - } -done: - put_dev_sector(sect); -} - -/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also - indicates linux swap. Be careful before believing this is Solaris. */ - -static void parse_solaris_x86(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_SOLARIS_X86_PARTITION - Sector sect; - struct solaris_x86_vtoc *v; - int i; - short max_nparts; - - v = read_part_sector(state, offset + 1, §); - if (!v) - return; - if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { - put_dev_sector(sect); - return; - } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - if (le32_to_cpu(v->v_version) != 1) { - char tmp[64]; - - snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", - le32_to_cpu(v->v_version)); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - put_dev_sector(sect); - return; - } - /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ - max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; - for (i=0; i<max_nparts && state->next<state->limit; i++) { - struct solaris_x86_slice *s = &v->v_slice[i]; - char tmp[3 + 10 + 1 + 1]; - - if (s->s_size == 0) - continue; - snprintf(tmp, sizeof(tmp), " [s%d]", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - /* solaris partitions are relative to current MS-DOS - * one; must add the offset of the current partition */ - put_partition(state, state->next++, - le32_to_cpu(s->s_start)+offset, - le32_to_cpu(s->s_size)); - } - put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -#endif -} - -#if defined(CONFIG_BSD_DISKLABEL) -/* - * Create devices for BSD partitions listed in a disklabel, under a - * dos-like partition. See parse_extended() for more information. - */ -static void parse_bsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) -{ - Sector sect; - struct bsd_disklabel *l; - struct bsd_partition *p; - char tmp[64]; - - l = read_part_sector(state, offset + 1, §); - if (!l) - return; - if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { - put_dev_sector(sect); - return; - } - - snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - - if (le16_to_cpu(l->d_npartitions) < max_partitions) - max_partitions = le16_to_cpu(l->d_npartitions); - for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { - sector_t bsd_start, bsd_size; - - if (state->next == state->limit) - break; - if (p->p_fstype == BSD_FS_UNUSED) - continue; - bsd_start = le32_to_cpu(p->p_offset); - bsd_size = le32_to_cpu(p->p_size); - if (offset == bsd_start && size == bsd_size) - /* full parent partition, we have it already */ - continue; - if (offset > bsd_start || offset+size < bsd_start+bsd_size) { - strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); - continue; - } - put_partition(state, state->next++, bsd_start, bsd_size); - } - put_dev_sector(sect); - if (le16_to_cpu(l->d_npartitions) > max_partitions) { - snprintf(tmp, sizeof(tmp), " (ignored %d more)", - le16_to_cpu(l->d_npartitions) - max_partitions); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -} -#endif - -static void parse_freebsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); -#endif -} - -static void parse_netbsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); -#endif -} - -static void parse_openbsd(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, offset, size, origin, "openbsd", - OPENBSD_MAXPARTITIONS); -#endif -} - -/* - * Create devices for Unixware partitions listed in a disklabel, under a - * dos-like partition. See parse_extended() for more information. - */ -static void parse_unixware(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_UNIXWARE_DISKLABEL - Sector sect; - struct unixware_disklabel *l; - struct unixware_slice *p; - - l = read_part_sector(state, offset + 29, §); - if (!l) - return; - if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || - le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { - put_dev_sector(sect); - return; - } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - p = &l->vtoc.v_slice[1]; - /* I omit the 0th slice as it is the same as whole disk. */ - while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { - if (state->next == state->limit) - break; - - if (p->s_label != UNIXWARE_FS_UNUSED) - put_partition(state, state->next++, - le32_to_cpu(p->start_sect), - le32_to_cpu(p->nr_sects)); - p++; - } - put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); -#endif -} - -/* - * Minix 2.0.0/2.0.2 subpartition support. - * Anand Krishnamurthy <anandk@wiproge.med.ge.com> - * Rajeev V. Pillai <rajeevvp@yahoo.com> - */ -static void parse_minix(struct parsed_partitions *state, - sector_t offset, sector_t size, int origin) -{ -#ifdef CONFIG_MINIX_SUBPARTITION - Sector sect; - unsigned char *data; - struct partition *p; - int i; - - data = read_part_sector(state, offset, §); - if (!data) - return; - - p = (struct partition *)(data + 0x1be); - - /* The first sector of a Minix partition can have either - * a secondary MBR describing its subpartitions, or - * the normal boot sector. */ - if (msdos_magic_present (data + 510) && - SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ - char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { - if (state->next == state->limit) - break; - /* add each partition in use */ - if (SYS_IND(p) == MINIX_PARTITION) - put_partition(state, state->next++, - start_sect(p), nr_sects(p)); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); - } - put_dev_sector(sect); -#endif /* CONFIG_MINIX_SUBPARTITION */ -} - -static struct { - unsigned char id; - void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); -} subtypes[] = { - {FREEBSD_PARTITION, parse_freebsd}, - {NETBSD_PARTITION, parse_netbsd}, - {OPENBSD_PARTITION, parse_openbsd}, - {MINIX_PARTITION, parse_minix}, - {UNIXWARE_PARTITION, parse_unixware}, - {SOLARIS_X86_PARTITION, parse_solaris_x86}, - {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, - {0, NULL}, -}; - -int msdos_partition(struct parsed_partitions *state) -{ - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; - Sector sect; - unsigned char *data; - struct partition *p; - struct fat_boot_sector *fb; - int slot; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - if (!msdos_magic_present(data + 510)) { - put_dev_sector(sect); - return 0; - } - - if (aix_magic_present(state, data)) { - put_dev_sector(sect); - strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); - return 0; - } - - /* - * Now that the 55aa signature is present, this is probably - * either the boot sector of a FAT filesystem or a DOS-type - * partition table. Reject this in case the boot indicator - * is not 0 or 0x80. - */ - p = (struct partition *) (data + 0x1be); - for (slot = 1; slot <= 4; slot++, p++) { - if (p->boot_ind != 0 && p->boot_ind != 0x80) { - /* - * Even without a valid boot inidicator value - * its still possible this is valid FAT filesystem - * without a partition table. - */ - fb = (struct fat_boot_sector *) data; - if (slot == 1 && fb->reserved && fb->fats - && fat_valid_media(fb->media)) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; - } else { - put_dev_sector(sect); - return 0; - } - } - } - -#ifdef CONFIG_EFI_PARTITION - p = (struct partition *) (data + 0x1be); - for (slot = 1 ; slot <= 4 ; slot++, p++) { - /* If this is an EFI GPT disk, msdos should ignore it. */ - if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { - put_dev_sector(sect); - return 0; - } - } -#endif - p = (struct partition *) (data + 0x1be); - - /* - * Look for partitions in two passes: - * First find the primary and DOS-type extended partitions. - * On the second pass look inside *BSD, Unixware and Solaris partitions. - */ - - state->next = 5; - for (slot = 1 ; slot <= 4 ; slot++, p++) { - sector_t start = start_sect(p)*sector_size; - sector_t size = nr_sects(p)*sector_size; - if (!size) - continue; - if (is_extended_partition(p)) { - /* - * prevent someone doing mkfs or mkswap on an - * extended partition, but leave room for LILO - * FIXME: this uses one logical sector for > 512b - * sector, although it may not be enough/proper. - */ - sector_t n = 2; - n = min(size, max(sector_size, n)); - put_partition(state, slot, start, n); - - strlcat(state->pp_buf, " <", PAGE_SIZE); - parse_extended(state, start, size); - strlcat(state->pp_buf, " >", PAGE_SIZE); - continue; - } - put_partition(state, slot, start, size); - if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[slot].flags = ADDPART_FLAG_RAID; - if (SYS_IND(p) == DM6_PARTITION) - strlcat(state->pp_buf, "[DM]", PAGE_SIZE); - if (SYS_IND(p) == EZD_PARTITION) - strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); - } - - strlcat(state->pp_buf, "\n", PAGE_SIZE); - - /* second pass - output for each on a separate line */ - p = (struct partition *) (0x1be + data); - for (slot = 1 ; slot <= 4 ; slot++, p++) { - unsigned char id = SYS_IND(p); - int n; - - if (!nr_sects(p)) - continue; - - for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) - ; - - if (!subtypes[n].parse) - continue; - subtypes[n].parse(state, start_sect(p) * sector_size, - nr_sects(p) * sector_size, slot); - } - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h deleted file mode 100644 index 38c781c490b..00000000000 --- a/fs/partitions/msdos.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/msdos.h - */ - -#define MSDOS_LABEL_MAGIC 0xAA55 - -int msdos_partition(struct parsed_partitions *state); - diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c deleted file mode 100644 index 764b86a0196..00000000000 --- a/fs/partitions/osf.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * fs/partitions/osf.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include "check.h" -#include "osf.h" - -#define MAX_OSF_PARTITIONS 18 - -int osf_partition(struct parsed_partitions *state) -{ - int i; - int slot = 1; - unsigned int npartitions; - Sector sect; - unsigned char *data; - struct disklabel { - __le32 d_magic; - __le16 d_type,d_subtype; - u8 d_typename[16]; - u8 d_packname[16]; - __le32 d_secsize; - __le32 d_nsectors; - __le32 d_ntracks; - __le32 d_ncylinders; - __le32 d_secpercyl; - __le32 d_secprtunit; - __le16 d_sparespertrack; - __le16 d_sparespercyl; - __le32 d_acylinders; - __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; - __le32 d_headswitch, d_trkseek, d_flags; - __le32 d_drivedata[5]; - __le32 d_spare[5]; - __le32 d_magic2; - __le16 d_checksum; - __le16 d_npartitions; - __le32 d_bbsize, d_sbsize; - struct d_partition { - __le32 p_size; - __le32 p_offset; - __le32 p_fsize; - u8 p_fstype; - u8 p_frag; - __le16 p_cpg; - } d_partitions[MAX_OSF_PARTITIONS]; - } * label; - struct d_partition * partition; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - label = (struct disklabel *) (data+64); - partition = label->d_partitions; - if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { - put_dev_sector(sect); - return 0; - } - if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { - put_dev_sector(sect); - return 0; - } - npartitions = le16_to_cpu(label->d_npartitions); - if (npartitions > MAX_OSF_PARTITIONS) { - put_dev_sector(sect); - return 0; - } - for (i = 0 ; i < npartitions; i++, partition++) { - if (slot == state->limit) - break; - if (le32_to_cpu(partition->p_size)) - put_partition(state, slot, - le32_to_cpu(partition->p_offset), - le32_to_cpu(partition->p_size)); - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h deleted file mode 100644 index 20ed2315ec1..00000000000 --- a/fs/partitions/osf.h +++ /dev/null @@ -1,7 +0,0 @@ -/* - * fs/partitions/osf.h - */ - -#define DISKLABELMAGIC (0x82564557UL) - -int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c deleted file mode 100644 index ea8a86dceaf..00000000000 --- a/fs/partitions/sgi.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * fs/partitions/sgi.c - * - * Code extracted from drivers/block/genhd.c - */ - -#include "check.h" -#include "sgi.h" - -struct sgi_disklabel { - __be32 magic_mushroom; /* Big fat spliff... */ - __be16 root_part_num; /* Root partition number */ - __be16 swap_part_num; /* Swap partition number */ - s8 boot_file[16]; /* Name of boot file for ARCS */ - u8 _unused0[48]; /* Device parameter useless crapola.. */ - struct sgi_volume { - s8 name[8]; /* Name of volume */ - __be32 block_num; /* Logical block number */ - __be32 num_bytes; /* How big, in bytes */ - } volume[15]; - struct sgi_partition { - __be32 num_blocks; /* Size in logical blocks */ - __be32 first_block; /* First logical block */ - __be32 type; /* Type of this partition */ - } partitions[16]; - __be32 csum; /* Disk label checksum */ - __be32 _unused1; /* Padding */ -}; - -int sgi_partition(struct parsed_partitions *state) -{ - int i, csum; - __be32 magic; - int slot = 1; - unsigned int start, blocks; - __be32 *ui, cs; - Sector sect; - struct sgi_disklabel *label; - struct sgi_partition *p; - char b[BDEVNAME_SIZE]; - - label = read_part_sector(state, 0, §); - if (!label) - return -1; - p = &label->partitions[0]; - magic = label->magic_mushroom; - if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ - put_dev_sector(sect); - return 0; - } - ui = ((__be32 *) (label + 1)) - 1; - for(csum = 0; ui >= ((__be32 *) label);) { - cs = *ui--; - csum += be32_to_cpu(cs); - } - if(csum) { - printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); - put_dev_sector(sect); - return 0; - } - /* All SGI disk labels have 16 partitions, disks under Linux only - * have 15 minor's. Luckily there are always a few zero length - * partitions which we don't care about so we never overflow the - * current_minor. - */ - for(i = 0; i < 16; i++, p++) { - blocks = be32_to_cpu(p->num_blocks); - start = be32_to_cpu(p->first_block); - if (blocks) { - put_partition(state, slot, start, blocks); - if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) - state->parts[slot].flags = ADDPART_FLAG_RAID; - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h deleted file mode 100644 index b9553ebdd5a..00000000000 --- a/fs/partitions/sgi.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/sgi.h - */ - -extern int sgi_partition(struct parsed_partitions *state); - -#define SGI_LABEL_MAGIC 0x0be5a941 - diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c deleted file mode 100644 index b5b6fcfb3d3..00000000000 --- a/fs/partitions/sun.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * fs/partitions/sun.c - * - * Code extracted from drivers/block/genhd.c - * - * Copyright (C) 1991-1998 Linus Torvalds - * Re-organised Feb 1998 Russell King - */ - -#include "check.h" -#include "sun.h" - -int sun_partition(struct parsed_partitions *state) -{ - int i; - __be16 csum; - int slot = 1; - __be16 *ush; - Sector sect; - struct sun_disklabel { - unsigned char info[128]; /* Informative text string */ - struct sun_vtoc { - __be32 version; /* Layout version */ - char volume[8]; /* Volume name */ - __be16 nparts; /* Number of partitions */ - struct sun_info { /* Partition hdrs, sec 2 */ - __be16 id; - __be16 flags; - } infos[8]; - __be16 padding; /* Alignment padding */ - __be32 bootinfo[3]; /* Info needed by mboot */ - __be32 sanity; /* To verify vtoc sanity */ - __be32 reserved[10]; /* Free space */ - __be32 timestamp[8]; /* Partition timestamp */ - } vtoc; - __be32 write_reinstruct; /* sectors to skip, writes */ - __be32 read_reinstruct; /* sectors to skip, reads */ - unsigned char spare[148]; /* Padding */ - __be16 rspeed; /* Disk rotational speed */ - __be16 pcylcount; /* Physical cylinder count */ - __be16 sparecyl; /* extra sects per cylinder */ - __be16 obs1; /* gap1 */ - __be16 obs2; /* gap2 */ - __be16 ilfact; /* Interleave factor */ - __be16 ncyl; /* Data cylinder count */ - __be16 nacyl; /* Alt. cylinder count */ - __be16 ntrks; /* Tracks per cylinder */ - __be16 nsect; /* Sectors per track */ - __be16 obs3; /* bhead - Label head offset */ - __be16 obs4; /* ppart - Physical Partition */ - struct sun_partition { - __be32 start_cylinder; - __be32 num_sectors; - } partitions[8]; - __be16 magic; /* Magic number */ - __be16 csum; /* Label xor'd checksum */ - } * label; - struct sun_partition *p; - unsigned long spc; - char b[BDEVNAME_SIZE]; - int use_vtoc; - int nparts; - - label = read_part_sector(state, 0, §); - if (!label) - return -1; - - p = label->partitions; - if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ - put_dev_sector(sect); - return 0; - } - /* Look at the checksum */ - ush = ((__be16 *) (label+1)) - 1; - for (csum = 0; ush >= ((__be16 *) label);) - csum ^= *ush--; - if (csum) { - printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); - put_dev_sector(sect); - return 0; - } - - /* Check to see if we can use the VTOC table */ - use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && - (be32_to_cpu(label->vtoc.version) == 1) && - (be16_to_cpu(label->vtoc.nparts) <= 8)); - - /* Use 8 partition entries if not specified in validated VTOC */ - nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; - - /* - * So that old Linux-Sun partitions continue to work, - * alow the VTOC to be used under the additional condition ... - */ - use_vtoc = use_vtoc || !(label->vtoc.sanity || - label->vtoc.version || label->vtoc.nparts); - spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); - for (i = 0; i < nparts; i++, p++) { - unsigned long st_sector; - unsigned int num_sectors; - - st_sector = be32_to_cpu(p->start_cylinder) * spc; - num_sectors = be32_to_cpu(p->num_sectors); - if (num_sectors) { - put_partition(state, slot, st_sector, num_sectors); - state->parts[slot].flags = 0; - if (use_vtoc) { - if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) - state->parts[slot].flags |= ADDPART_FLAG_RAID; - else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) - state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; - } - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h deleted file mode 100644 index 2424baa8319..00000000000 --- a/fs/partitions/sun.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * fs/partitions/sun.h - */ - -#define SUN_LABEL_MAGIC 0xDABE -#define SUN_VTOC_SANITY 0x600DDEEE - -int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c deleted file mode 100644 index 9627ccffc1c..00000000000 --- a/fs/partitions/sysv68.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * fs/partitions/sysv68.c - * - * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be> - */ - -#include "check.h" -#include "sysv68.h" - -/* - * Volume ID structure: on first 256-bytes sector of disk - */ - -struct volumeid { - u8 vid_unused[248]; - u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ -}; - -/* - * config block: second 256-bytes sector on disk - */ - -struct dkconfig { - u8 ios_unused0[128]; - __be32 ios_slcblk; /* Slice table block number */ - __be16 ios_slccnt; /* Number of entries in slice table */ - u8 ios_unused1[122]; -}; - -/* - * combined volumeid and dkconfig block - */ - -struct dkblk0 { - struct volumeid dk_vid; - struct dkconfig dk_ios; -}; - -/* - * Slice Table Structure - */ - -struct slice { - __be32 nblocks; /* slice size (in blocks) */ - __be32 blkoff; /* block offset of slice */ -}; - - -int sysv68_partition(struct parsed_partitions *state) -{ - int i, slices; - int slot = 1; - Sector sect; - unsigned char *data; - struct dkblk0 *b; - struct slice *slice; - char tmp[64]; - - data = read_part_sector(state, 0, §); - if (!data) - return -1; - - b = (struct dkblk0 *)data; - if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { - put_dev_sector(sect); - return 0; - } - slices = be16_to_cpu(b->dk_ios.ios_slccnt); - i = be32_to_cpu(b->dk_ios.ios_slcblk); - put_dev_sector(sect); - - data = read_part_sector(state, i, §); - if (!data) - return -1; - - slices -= 1; /* last slice is the whole disk */ - snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - slice = (struct slice *)data; - for (i = 0; i < slices; i++, slice++) { - if (slot == state->limit) - break; - if (be32_to_cpu(slice->nblocks)) { - put_partition(state, slot, - be32_to_cpu(slice->blkoff), - be32_to_cpu(slice->nblocks)); - snprintf(tmp, sizeof(tmp), "(s%u)", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - slot++; - } - strlcat(state->pp_buf, "\n", PAGE_SIZE); - put_dev_sector(sect); - return 1; -} diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h deleted file mode 100644 index bf2f5ffa97a..00000000000 --- a/fs/partitions/sysv68.h +++ /dev/null @@ -1 +0,0 @@ -extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c deleted file mode 100644 index 8dbaf9f77a9..00000000000 --- a/fs/partitions/ultrix.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * fs/partitions/ultrix.c - * - * Code extracted from drivers/block/genhd.c - * - * Re-organised Jul 1999 Russell King - */ - -#include "check.h" -#include "ultrix.h" - -int ultrix_partition(struct parsed_partitions *state) -{ - int i; - Sector sect; - unsigned char *data; - struct ultrix_disklabel { - s32 pt_magic; /* magic no. indicating part. info exits */ - s32 pt_valid; /* set by driver if pt is current */ - struct pt_info { - s32 pi_nblocks; /* no. of sectors */ - u32 pi_blkoff; /* block offset for start */ - } pt_part[8]; - } *label; - -#define PT_MAGIC 0x032957 /* Partition magic number */ -#define PT_VALID 1 /* Indicates if struct is valid */ - - data = read_part_sector(state, (16384 - sizeof(*label))/512, §); - if (!data) - return -1; - - label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); - - if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { - for (i=0; i<8; i++) - if (label->pt_part[i].pi_nblocks) - put_partition(state, i+1, - label->pt_part[i].pi_blkoff, - label->pt_part[i].pi_nblocks); - put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); - return 1; - } else { - put_dev_sector(sect); - return 0; - } -} diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h deleted file mode 100644 index a3cc00b2bde..00000000000 --- a/fs/partitions/ultrix.h +++ /dev/null @@ -1,5 +0,0 @@ -/* - * fs/partitions/ultrix.h - */ - -int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c index 4065f07366b..a932ced92a1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; @@ -1290,11 +1290,4 @@ static int __init init_pipe_fs(void) return err; } -static void __exit exit_pipe_fs(void) -{ - kern_unmount(pipe_mnt); - unregister_filesystem(&pipe_fs_type); -} - fs_initcall(init_pipe_fs); -module_exit(exit_pipe_fs); diff --git a/fs/pnode.c b/fs/pnode.c index d42514e3238..ab5fa9e1a79 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -13,45 +13,30 @@ #include "pnode.h" /* return the next shared peer mount of @p */ -static inline struct vfsmount *next_peer(struct vfsmount *p) +static inline struct mount *next_peer(struct mount *p) { - return list_entry(p->mnt_share.next, struct vfsmount, mnt_share); + return list_entry(p->mnt_share.next, struct mount, mnt_share); } -static inline struct vfsmount *first_slave(struct vfsmount *p) +static inline struct mount *first_slave(struct mount *p) { - return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave); + return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } -static inline struct vfsmount *next_slave(struct vfsmount *p) +static inline struct mount *next_slave(struct mount *p) { - return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); + return list_entry(p->mnt_slave.next, struct mount, mnt_slave); } -/* - * Return true if path is reachable from root - * - * namespace_sem is held, and mnt is attached - */ -static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, - const struct path *root) -{ - while (mnt != root->mnt && mnt->mnt_parent != mnt) { - dentry = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - return mnt == root->mnt && is_subdir(dentry, root->dentry); -} - -static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, - struct mnt_namespace *ns, - const struct path *root) +static struct mount *get_peer_under_root(struct mount *mnt, + struct mnt_namespace *ns, + const struct path *root) { - struct vfsmount *m = mnt; + struct mount *m = mnt; do { /* Check the namespace first for optimization */ - if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); @@ -66,12 +51,12 @@ static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, * * Caller must hold namespace_sem */ -int get_dominating_id(struct vfsmount *mnt, const struct path *root) +int get_dominating_id(struct mount *mnt, const struct path *root) { - struct vfsmount *m; + struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { - struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } @@ -79,10 +64,10 @@ int get_dominating_id(struct vfsmount *mnt, const struct path *root) return 0; } -static int do_make_slave(struct vfsmount *mnt) +static int do_make_slave(struct mount *mnt) { - struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; - struct vfsmount *slave_mnt; + struct mount *peer_mnt = mnt, *master = mnt->mnt_master; + struct mount *slave_mnt; /* * slave 'mnt' to a peer mount that has the @@ -90,7 +75,7 @@ static int do_make_slave(struct vfsmount *mnt) * slave it to anything that is available. */ while ((peer_mnt = next_peer(peer_mnt)) != mnt && - peer_mnt->mnt_root != mnt->mnt_root) ; + peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; if (peer_mnt == mnt) { peer_mnt = next_peer(mnt); @@ -116,7 +101,7 @@ static int do_make_slave(struct vfsmount *mnt) struct list_head *p = &mnt->mnt_slave_list; while (!list_empty(p)) { slave_mnt = list_first_entry(p, - struct vfsmount, mnt_slave); + struct mount, mnt_slave); list_del_init(&slave_mnt->mnt_slave); slave_mnt->mnt_master = NULL; } @@ -129,7 +114,7 @@ static int do_make_slave(struct vfsmount *mnt) /* * vfsmount lock must be held for write */ -void change_mnt_propagation(struct vfsmount *mnt, int type) +void change_mnt_propagation(struct mount *mnt, int type) { if (type == MS_SHARED) { set_mnt_shared(mnt); @@ -140,9 +125,9 @@ void change_mnt_propagation(struct vfsmount *mnt, int type) list_del_init(&mnt->mnt_slave); mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) - mnt->mnt_flags |= MNT_UNBINDABLE; + mnt->mnt.mnt_flags |= MNT_UNBINDABLE; else - mnt->mnt_flags &= ~MNT_UNBINDABLE; + mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; } } @@ -156,20 +141,19 @@ void change_mnt_propagation(struct vfsmount *mnt, int type) * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ -static struct vfsmount *propagation_next(struct vfsmount *m, - struct vfsmount *origin) +static struct mount *propagation_next(struct mount *m, + struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { - struct vfsmount *next; - struct vfsmount *master = m->mnt_master; + struct mount *master = m->mnt_master; if (master == origin->mnt_master) { - next = next_peer(m); - return ((next == origin) ? NULL : next); + struct mount *next = next_peer(m); + return (next == origin) ? NULL : next; } else if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); @@ -187,13 +171,13 @@ static struct vfsmount *propagation_next(struct vfsmount *m, * @type return CL_SLAVE if the new mount has to be * cloned as a slave. */ -static struct vfsmount *get_source(struct vfsmount *dest, - struct vfsmount *last_dest, - struct vfsmount *last_src, - int *type) +static struct mount *get_source(struct mount *dest, + struct mount *last_dest, + struct mount *last_src, + int *type) { - struct vfsmount *p_last_src = NULL; - struct vfsmount *p_last_dest = NULL; + struct mount *p_last_src = NULL; + struct mount *p_last_dest = NULL; while (last_dest != dest->mnt_master) { p_last_dest = last_dest; @@ -233,33 +217,33 @@ static struct vfsmount *get_source(struct vfsmount *dest, * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ -int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, - struct vfsmount *source_mnt, struct list_head *tree_list) +int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, + struct mount *source_mnt, struct list_head *tree_list) { - struct vfsmount *m, *child; + struct mount *m, *child; int ret = 0; - struct vfsmount *prev_dest_mnt = dest_mnt; - struct vfsmount *prev_src_mnt = source_mnt; + struct mount *prev_dest_mnt = dest_mnt; + struct mount *prev_src_mnt = source_mnt; LIST_HEAD(tmp_list); LIST_HEAD(umount_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { int type; - struct vfsmount *source; + struct mount *source; if (IS_MNT_NEW(m)) continue; source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - if (!(child = copy_tree(source, source->mnt_root, type))) { + if (!(child = copy_tree(source, source->mnt.mnt_root, type))) { ret = -ENOMEM; list_splice(tree_list, tmp_list.prev); goto out; } - if (is_subdir(dest_dentry, m->mnt_root)) { + if (is_subdir(dest_dentry, m->mnt.mnt_root)) { mnt_set_mountpoint(m, dest_dentry, child); list_add_tail(&child->mnt_hash, tree_list); } else { @@ -275,7 +259,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, out: br_write_lock(vfsmount_lock); while (!list_empty(&tmp_list)) { - child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); + child = list_first_entry(&tmp_list, struct mount, mnt_hash); umount_tree(child, 0, &umount_list); } br_write_unlock(vfsmount_lock); @@ -286,7 +270,7 @@ out: /* * return true if the refcount is greater than count */ -static inline int do_refcount_check(struct vfsmount *mnt, int count) +static inline int do_refcount_check(struct mount *mnt, int count) { int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts; return (mycount > count); @@ -302,10 +286,10 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count) * * vfsmount lock must be held for write */ -int propagate_mount_busy(struct vfsmount *mnt, int refcnt) +int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct vfsmount *m, *child; - struct vfsmount *parent = mnt->mnt_parent; + struct mount *m, *child; + struct mount *parent = mnt->mnt_parent; int ret = 0; if (mnt == parent) @@ -321,7 +305,7 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt(m, mnt->mnt_mountpoint, 0); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0); if (child && list_empty(&child->mnt_mounts) && (ret = do_refcount_check(child, 1))) break; @@ -333,17 +317,17 @@ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct vfsmount *mnt) +static void __propagate_umount(struct mount *mnt) { - struct vfsmount *parent = mnt->mnt_parent; - struct vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - struct vfsmount *child = __lookup_mnt(m, + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0); /* * umount the child only if the child has no @@ -363,7 +347,7 @@ static void __propagate_umount(struct vfsmount *mnt) */ int propagate_umount(struct list_head *list) { - struct vfsmount *mnt; + struct mount *mnt; list_for_each_entry(mnt, list, mnt_hash) __propagate_umount(mnt); diff --git a/fs/pnode.h b/fs/pnode.h index 1ea4ae1efcd..65c60979d54 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -9,13 +9,13 @@ #define _LINUX_PNODE_H #include <linux/list.h> -#include <linux/mount.h> +#include "mount.h" -#define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) -#define IS_MNT_SLAVE(mnt) (mnt->mnt_master) -#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) -#define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) -#define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) +#define IS_MNT_SLAVE(m) ((m)->mnt_master) +#define IS_MNT_NEW(m) (!(m)->mnt_ns) +#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) +#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -23,17 +23,25 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 -static inline void set_mnt_shared(struct vfsmount *mnt) +static inline void set_mnt_shared(struct mount *mnt) { - mnt->mnt_flags &= ~MNT_SHARED_MASK; - mnt->mnt_flags |= MNT_SHARED; + mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; + mnt->mnt.mnt_flags |= MNT_SHARED; } -void change_mnt_propagation(struct vfsmount *, int); -int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, +void change_mnt_propagation(struct mount *, int); +int propagate_mnt(struct mount *, struct dentry *, struct mount *, struct list_head *); int propagate_umount(struct list_head *); -int propagate_mount_busy(struct vfsmount *, int); -void mnt_release_group_id(struct vfsmount *); -int get_dominating_id(struct vfsmount *mnt, const struct path *root); +int propagate_mount_busy(struct mount *, int); +void mnt_release_group_id(struct mount *); +int get_dominating_id(struct mount *mnt, const struct path *root); +unsigned int mnt_get_count(struct mount *mnt); +void mnt_set_mountpoint(struct mount *, struct dentry *, + struct mount *); +void release_mounts(struct list_head *); +void umount_tree(struct mount *, int, struct list_head *); +struct mount *copy_tree(struct mount *, struct dentry *, int); +bool is_path_reachable(struct mount *, struct dentry *, + const struct path *root); #endif /* _LINUX_PNODE_H */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..c602b8d20f0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); @@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; - cgtime = gtime = cputime_zero; + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, t->gtime); + gtime += t->gtime; t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; thread_group_times(task, &utime, &stime); - gtime = cputime_add(gtime, sig->gtime); + gtime += sig->gtime; } sid = task_session_nr_ns(task, ns); @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 851ba3dcdc2..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,9 +83,11 @@ #include <linux/pid_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/flex_array.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif +#include <trace/events/oom.h> #include "internal.h" /* NOTE: @@ -101,7 +103,7 @@ struct pid_entry { char *name; int len; - mode_t mode; + umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; @@ -133,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -194,82 +198,9 @@ static int proc_root_link(struct inode *inode, struct path *path) return result; } -static struct mm_struct *__check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - - mm = get_task_mm(task); - if (!mm) - return ERR_PTR(-EINVAL); - - /* - * A task can always look at itself, in case it chooses - * to use system calls instead of load instructions. - */ - if (task == current) - return mm; - - /* - * If current is actively ptrace'ing, and would also be - * permitted to freshly attach with ptrace now, permit it. - */ - if (task_is_stopped_or_traced(task)) { - int match; - rcu_read_lock(); - match = (ptrace_parent(task) == current); - rcu_read_unlock(); - if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return mm; - } - - /* - * No one else is allowed. - */ - mmput(mm); - return ERR_PTR(-EPERM); -} - -/* - * If current may access user memory in @task return a reference to the - * corresponding mm, otherwise ERR_PTR. - */ -static struct mm_struct *check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - int err; - - /* - * Avoid racing if task exec's as we might get a new mm but validate - * against old credentials. - */ - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = __check_mem_permission(task); - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, PTRACE_MODE_READ)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; + return mm_access(task, PTRACE_MODE_READ); } static int proc_pid_cmdline(struct task_struct *task, char * buffer) @@ -627,122 +558,54 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static const struct inode_operations proc_def_inode_operations = { - .setattr = proc_setattr, -}; - -static int mounts_open_common(struct inode *inode, struct file *file, - const struct seq_operations *op) +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) { - struct task_struct *task = get_proc_task(inode); - struct nsproxy *nsp; - struct mnt_namespace *ns = NULL; - struct path root; - struct proc_mounts *p; - int ret = -EINVAL; - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - ns = nsp->mnt_ns; - if (ns) - get_mnt_ns(ns); - } - rcu_read_unlock(); - if (ns && get_task_root(task, &root) == 0) - ret = 0; - put_task_struct(task); - } - - if (!ns) - goto err; - if (ret) - goto err_put_ns; - - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) - goto err_put_path; - - file->private_data = &p->m; - ret = seq_open(file, op); - if (ret) - goto err_free; - - p->m.private = p; - p->ns = ns; - p->root = root; - p->m.poll_event = ns->event; - - return 0; - - err_free: - kfree(p); - err_put_path: - path_put(&root); - err_put_ns: - put_mnt_ns(ns); - err: - return ret; + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); } -static int mounts_release(struct inode *inode, struct file *file) -{ - struct proc_mounts *p = file->private_data; - path_put(&p->root); - put_mnt_ns(p->ns); - return seq_release(inode, file); -} -static unsigned mounts_poll(struct file *file, poll_table *wait) +static int proc_pid_permission(struct inode *inode, int mask) { - struct proc_mounts *p = file->private_data; - unsigned res = POLLIN | POLLRDNORM; - - poll_wait(file, &p->ns->poll, wait); - if (mnt_had_events(p)) - res |= POLLERR | POLLPRI; - - return res; -} + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; -static int mounts_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mounts_op); -} + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); -static const struct file_operations proc_mounts_operations = { - .open = mounts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } -static int mountinfo_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountinfo_op); + return -EPERM; + } + return generic_permission(inode, mask); } -static const struct file_operations proc_mountinfo_operations = { - .open = mountinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; -static int mountstats_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountstats_op); -} -static const struct file_operations proc_mountstats_operations = { - .open = mountstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, }; #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ @@ -816,133 +679,96 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { - file->private_data = (void*)((long)current->self_exec_id); - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; - return 0; -} - -static ssize_t mem_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char *page; - unsigned long src = *ppos; - int ret = -ESRCH; struct mm_struct *mm; if (!task) - goto out_no_task; + return -ESRCH; - ret = -ENOMEM; - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - goto out; + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); - mm = check_mem_permission(task); - ret = PTR_ERR(mm); if (IS_ERR(mm)) - goto out_free; + return PTR_ERR(mm); - ret = -EIO; - - if (file->private_data != (void*)((long)current->self_exec_id)) - goto out_put; - - ret = 0; - - while (count > 0) { - int this_len, retval; - - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_remote_vm(mm, src, page, this_len, 0); - if (!retval) { - if (!ret) - ret = -EIO; - break; - } - - if (copy_to_user(buf, page, retval)) { - ret = -EFAULT; - break; - } - - ret += retval; - src += retval; - buf += retval; - count -= retval; + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); } - *ppos = src; -out_put: - mmput(mm); -out_free: - free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: - return ret; + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + + return 0; } -static ssize_t mem_write(struct file * file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) { - int copied; + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; char *page; - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - unsigned long dst = *ppos; - struct mm_struct *mm; - copied = -ESRCH; - if (!task) - goto out_no_task; + if (!mm) + return 0; - copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out_task; - - mm = check_mem_permission(task); - copied = PTR_ERR(mm); - if (IS_ERR(mm)) - goto out_free; - - copied = -EIO; - if (file->private_data != (void *)((long)current->self_exec_id)) - goto out_mm; + return -ENOMEM; copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { - int this_len, retval; + int this_len = min_t(int, count, PAGE_SIZE); - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } - retval = access_remote_vm(mm, dst, page, this_len, 1); - if (!retval) { + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { if (!copied) copied = -EIO; break; } - copied += retval; - buf += retval; - dst += retval; - count -= retval; + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; } - *ppos = dst; + *ppos = addr; -out_mm: mmput(mm); -out_free: +free: free_page((unsigned long) page); -out_task: - put_task_struct(task); -out_no_task: return copied; } +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { @@ -959,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, + .release = mem_release, }; static ssize_t environ_read(struct file *file, char __user *buf, @@ -1124,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1211,6 +1047,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. @@ -1261,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); @@ -1292,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + length = audit_set_loginuid(loginuid); if (likely(length == 0)) length = count; @@ -1567,13 +1401,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1603,7 +1437,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1642,7 +1476,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1723,6 +1557,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1731,6 +1566,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -1934,9 +1777,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -2157,6 +2000,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. + */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2772,6 +2964,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET @@ -2875,6 +3070,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3078,6 +3274,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3085,6 +3287,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3106,8 +3309,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3442,6 +3650,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 10090d9c7ad..2edf34f2eb6 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -597,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, - mode_t mode, + umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; @@ -659,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -699,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, +struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -728,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, } EXPORT_SYMBOL(create_proc_entry); -struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7737c5468a4..84fd3235a59 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/pid_namespace.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> @@ -17,7 +18,9 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/sysctl.h> +#include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/mount.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -77,7 +80,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } @@ -102,12 +104,27 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5cfec1..292577531ad 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index be177f702ac..27da860115c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -9,7 +9,6 @@ #include <linux/file.h> #include <linux/utsname.h> #include <net/net_namespace.h> -#include <linux/mnt_namespace.h> #include <linux/ipc_namespace.h> #include <linux/pid_namespace.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index f738024ccc8..06e1cc17caf 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = { struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, mode_t mode, const struct file_operations *fops) + const char *name, umode_t mode, const struct file_operations *fops) { return proc_create(name, mode, net->proc_net, fops); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d97818..46a15d8a29c 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/mount.h> #include <linux/pid_namespace.h> +#include <linux/parser.h> #include "internal.h" @@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_gid, Opt_hidepid, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0855e6f2039..121f77cfef7 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,29 +22,27 @@ #define arch_idle_time(cpu) 0 #endif -static cputime64_t get_idle_time(int cpu) +static u64 get_idle_time(int cpu) { - u64 idle_time = get_cpu_idle_time_us(cpu, NULL); - cputime64_t idle; + u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) { /* !NO_HZ so we can rely on cpustat.idle */ - idle = kstat_cpu(cpu).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(cpu)); + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle += arch_idle_time(cpu); } else idle = usecs_to_cputime64(idle_time); return idle; } -static cputime64_t get_iowait_time(int cpu) +static u64 get_iowait_time(int cpu) { - u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); - cputime64_t iowait; + u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); if (iowait_time == -1ULL) /* !NO_HZ so we can rely on cpustat.iowait */ - iowait = kstat_cpu(cpu).cpustat.iowait; + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else iowait = usecs_to_cputime64(iowait_time); @@ -55,31 +53,30 @@ static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest, guest_nice; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, get_idle_time(i)); - iowait = cputime64_add(iowait, get_iowait_time(i)); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -106,16 +103,16 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(i); iowait = get_iowait_time(i); - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - guest_nice = kstat_cpu(i).cpustat.guest_nice; + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " "%llu\n", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e418c5abdb0..7dcd2a25049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; + if (PageReserved(page)) + continue; + /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d45605..9610ac772d7 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = cputime_zero; + idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c new file mode 100644 index 00000000000..12412852d88 --- /dev/null +++ b/fs/proc_namespace.c @@ -0,0 +1,333 @@ +/* + * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} + * + * In fact, that's a piece of procfs; it's *almost* isolated from + * the rest of fs/proc, but has rather close relationships with + * fs/namespace.c, thus here instead of fs/proc + * + */ +#include <linux/mnt_namespace.h> +#include <linux/nsproxy.h> +#include <linux/security.h> +#include <linux/fs_struct.h> +#include "proc/internal.h" /* only for get_proc_task() in ->open() */ + +#include "pnode.h" +#include "internal.h" + +static unsigned mounts_poll(struct file *file, poll_table *wait) +{ + struct proc_mounts *p = file->private_data; + struct mnt_namespace *ns = p->ns; + unsigned res = POLLIN | POLLRDNORM; + + poll_wait(file, &p->ns->poll, wait); + + br_read_lock(vfsmount_lock); + if (p->m.poll_event != ns->event) { + p->m.poll_event = ns->event; + res |= POLLERR | POLLPRI; + } + br_read_unlock(vfsmount_lock); + + return res; +} + +struct proc_fs_info { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_info fs_info[] = { + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { 0, NULL } + }; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\"); +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) +{ + struct mount *r = real_mount(mnt); + int err = 0; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; + } else { + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + } + seq_putc(m, ' '); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + show_type(m, sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt_path.dentry); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + struct mount *r = real_mount(mnt); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) + err = sb->s_op->show_path(m, mnt->mnt_root); + else + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(r)) + seq_printf(m, " shared:%i", r->mnt_group_id); + if (IS_MNT_SLAVE(r)) { + int master = r->mnt_master->mnt_group_id; + int dom = get_dominating_id(r, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(r)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) + err = sb->s_op->show_devname(m, mnt->mnt_root); + else + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + if (err) + goto out; + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt->mnt_root); + seq_putc(m, '\n'); +out: + return err; +} + +static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) +{ + struct mount *r = real_mount(mnt); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + int err = 0; + + /* device */ + if (sb->s_op->show_devname) { + seq_puts(m, "device "); + err = sb->s_op->show_devname(m, mnt_path.dentry); + } else { + if (r->mnt_devname) { + seq_puts(m, "device "); + mangle(m, r->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, &mnt_path, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, sb); + + /* optional statistics */ + if (sb->s_op->show_stats) { + seq_putc(m, ' '); + if (!err) + err = sb->s_op->show_stats(m, mnt_path.dentry); + } + + seq_putc(m, '\n'); + return err; +} + +static int mounts_open_common(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, struct vfsmount *)) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + int ret = -EINVAL; + + if (!task) + goto err; + + rcu_read_lock(); + nsp = task_nsproxy(task); + if (!nsp) { + rcu_read_unlock(); + put_task_struct(task); + goto err; + } + ns = nsp->mnt_ns; + if (!ns) { + rcu_read_unlock(); + put_task_struct(task); + goto err; + } + get_mnt_ns(ns); + rcu_read_unlock(); + task_lock(task); + if (!task->fs) { + task_unlock(task); + put_task_struct(task); + ret = -ENOENT; + goto err_put_ns; + } + get_fs_root(task->fs, &root); + task_unlock(task); + put_task_struct(task); + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, &mounts_op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->m.poll_event = ns->event; + p->show = show; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); + return seq_release(inode, file); +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsmnt); +} + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_mountinfo); +} + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsstat); +} + +const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 379a02dc121..b3b426edb2f 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -80,7 +80,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->psi->erase(p->type, p->id, p->psi); + if (p->psi->erase) + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 57bbf9078ac..9ec22d3b429 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -122,7 +122,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - ret = psinfo->write(PSTORE_TYPE_DMESG, &id, part, + ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, hsize + l1_cpy + l2_cpy, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -207,8 +207,7 @@ void pstore_get_records(int quiet) return; mutex_lock(&psi->read_mutex); - rc = psi->open(psi); - if (rc) + if (psi->open && psi->open(psi)) goto out; while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { @@ -219,7 +218,8 @@ void pstore_get_records(int quiet) if (rc && (rc != -EEXIST || !quiet)) failed++; } - psi->close(psi); + if (psi->close) + psi->close(psi); out: mutex_unlock(&psi->read_mutex); @@ -243,33 +243,5 @@ static void pstore_timefunc(unsigned long dummy) mod_timer(&pstore_timer, jiffies + PSTORE_INTERVAL); } -/* - * Call platform driver to write a record to the - * persistent store. - */ -int pstore_write(enum pstore_type_id type, char *buf, size_t size) -{ - u64 id; - int ret; - unsigned long flags; - - if (!psinfo) - return -ENODEV; - - if (size > psinfo->bufsize) - return -EFBIG; - - spin_lock_irqsave(&psinfo->buf_lock, flags); - memcpy(psinfo->buf, buf, size); - ret = psinfo->write(type, &id, 0, size, psinfo); - if (ret == 0 && pstore_is_mounted()) - pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo); - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(pstore_write); - module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3bdd2141843..6b009548d2e 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -179,46 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb) struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; - int found = 0; - if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { + if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') return "no qnx4 filesystem (no root dir)."; - } else { - QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); - rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; - rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); - for (j = 0; j < rl; j++) { - bh = sb_bread(sb, rd + j); /* root dir, first block */ - if (bh == NULL) { - return "unable to read root entry."; - } - for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { - rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); - if (rootdir->di_fname != NULL) { - QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); - if (!strcmp(rootdir->di_fname, - QNX4_BMNAME)) { - found = 1; - qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); - if (!qnx4_sb(sb)->BitMap) { - brelse (bh); - return "not enough memory for bitmap inode"; - } - memcpy( qnx4_sb(sb)->BitMap, rootdir, sizeof( struct qnx4_inode_entry ) ); /* keep bitmap inode known */ - break; - } - } - } + QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); + rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + for (j = 0; j < rl; j++) { + bh = sb_bread(sb, rd + j); /* root dir, first block */ + if (bh == NULL) + return "unable to read root entry."; + rootdir = (struct qnx4_inode_entry *) bh->b_data; + for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) { + QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); + if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0) + continue; + qnx4_sb(sb)->BitMap = kmemdup(rootdir, + sizeof(struct qnx4_inode_entry), + GFP_KERNEL); brelse(bh); - if (found != 0) { - break; - } - } - if (found == 0) { - return "bitmap file not found."; + if (!qnx4_sb(sb)->BitMap) + return "not enough memory for bitmap inode"; + /* keep bitmap inode known */ + return NULL; } + brelse(bh); } - return NULL; + return "bitmap file not found."; } static int qnx4_fill_super(struct super_block *s, void *data, int silent) @@ -269,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); ret = PTR_ERR(root); - goto out; + goto outb; } ret = -ENOMEM; @@ -282,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) outi: iput(root); + outb: + kfree(qs->BitMap); out: brelse(bh); outnobh: @@ -427,7 +416,6 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) static void qnx4_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5b572c89e6c..46741970371 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -73,7 +73,6 @@ #include <linux/security.h> #include <linux/kmod.h> #include <linux/namei.h> -#include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> #include "../internal.h" /* ugh */ @@ -2126,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, mutex_unlock(&dqopt->dqio_mutex); goto out_file_init; } + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + dqopt->info[type].dqi_flags |= DQF_SYS_FILE; mutex_unlock(&dqopt->dqio_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); @@ -2199,7 +2200,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (error) return error; /* Quota file not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) + if (path->dentry->d_sb != sb) error = -EXDEV; else error = vfs_load_quota_inode(path->dentry->d_inode, type, @@ -2465,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) spin_lock(&dq_data_lock); ii->dqi_bgrace = mi->dqi_bgrace; ii->dqi_igrace = mi->dqi_igrace; - ii->dqi_flags = mi->dqi_flags & DQF_MASK; + ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK; ii->dqi_valid = IIF_ALL; spin_unlock(&dq_data_lock); mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); @@ -2491,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) if (ii->dqi_valid & IIF_IGRACE) mi->dqi_igrace = ii->dqi_igrace; if (ii->dqi_valid & IIF_FLAGS) - mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | - (ii->dqi_flags & DQF_MASK); + mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) | + (ii->dqi_flags & DQF_SETINFO_MASK); spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 35f4b0ecdeb..fc2c4388d12 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -13,7 +13,6 @@ #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> @@ -293,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, } } +/* Return 1 if 'cmd' will block on frozen filesystem */ +static int quotactl_cmd_write(int cmd) +{ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_SYNC: + case Q_XGETQSTAT: + case Q_XGETQUOTA: + case Q_XQUOTASYNC: + return 0; + } + return 1; +} + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon */ -static struct super_block *quotactl_block(const char __user *special) +static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK struct block_device *bdev; @@ -310,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - sb = get_super(bdev); + if (quotactl_cmd_write(cmd)) + sb = get_super_thawed(bdev); + else + sb = get_super(bdev); bdput(bdev); if (!sb) return ERR_PTR(-ENODEV); @@ -362,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, pathp = &path; } - sb = quotactl_block(special); + sb = quotactl_block(special, cmds); if (IS_ERR(sb)) { ret = PTR_ERR(sb); goto out; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 462ceb38fec..aec766abe3a 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -52,7 +52,7 @@ static struct backing_dev_info ramfs_backing_dev_info = { }; struct inode *ramfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev) + const struct inode *dir, umode_t mode, dev_t dev) { struct inode * inode = new_inode(sb); @@ -92,7 +92,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; @@ -106,7 +106,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } -static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); if (!retval) @@ -114,7 +114,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); } diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index d1aca1df4f9..70de42f09f1 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -13,6 +13,7 @@ #include <linux/reiserfs_fs_sb.h> #include <linux/reiserfs_fs_i.h> #include <linux/quotaops.h> +#include <linux/seq_file.h> #define PREALLOCATION_SIZE 9 @@ -634,6 +635,96 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) return 0; } +static void print_sep(struct seq_file *seq, int *first) +{ + if (!*first) + seq_puts(seq, ":"); + else + *first = 0; +} + +void show_alloc_options(struct seq_file *seq, struct super_block *s) +{ + int first = 1; + + if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | + (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) + return; + + seq_puts(seq, ",alloc="); + + if (TEST_OPTION(concentrating_formatted_nodes, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.border != 10) { + seq_printf(seq, "concentrating_formatted_nodes=%d", + 100 / REISERFS_SB(s)->s_alloc_options.border); + } else + seq_puts(seq, "concentrating_formatted_nodes"); + } + if (TEST_OPTION(displacing_large_files, s)) { + print_sep(seq, &first); + if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { + seq_printf(seq, "displacing_large_files=%lu", + REISERFS_SB(s)->s_alloc_options.large_file_size); + } else + seq_puts(seq, "displacing_large_files"); + } + if (TEST_OPTION(displacing_new_packing_localities, s)) { + print_sep(seq, &first); + seq_puts(seq, "displacing_new_packing_localities"); + } + if (TEST_OPTION(old_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_hashed_relocation"); + } + if (TEST_OPTION(new_hashed_relocation, s)) { + print_sep(seq, &first); + seq_puts(seq, "new_hashed_relocation"); + } + if (TEST_OPTION(dirid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "dirid_groups"); + } + if (TEST_OPTION(oid_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "oid_groups"); + } + if (TEST_OPTION(packing_groups, s)) { + print_sep(seq, &first); + seq_puts(seq, "packing_groups"); + } + if (TEST_OPTION(hashed_formatted_nodes, s)) { + print_sep(seq, &first); + seq_puts(seq, "hashed_formatted_nodes"); + } + if (TEST_OPTION(skip_busy, s)) { + print_sep(seq, &first); + seq_puts(seq, "skip_busy"); + } + if (TEST_OPTION(hundredth_slices, s)) { + print_sep(seq, &first); + seq_puts(seq, "hundredth_slices"); + } + if (TEST_OPTION(old_way, s)) { + print_sep(seq, &first); + seq_puts(seq, "old_way"); + } + if (TEST_OPTION(displace_based_on_dirid, s)) { + print_sep(seq, &first); + seq_puts(seq, "displace_based_on_dirid"); + } + if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { + print_sep(seq, &first); + seq_printf(seq, "preallocmin=%d", + REISERFS_SB(s)->s_alloc_options.preallocmin); + } + if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { + print_sep(seq, &first); + seq_printf(seq, "preallocsize=%d", + REISERFS_SB(s)->s_alloc_options.preallocsize); + } +} + static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; @@ -1273,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - /* Avoid lock recursion in fault case */ - reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); - reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 950f13af095..9e8cd5acd79 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1766,7 +1766,7 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i for the fresh inode. This can only be done outside a transaction, so if we return non-zero, we also end the transaction. */ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, - struct inode *dir, int mode, const char *symname, + struct inode *dir, umode_t mode, const char *symname, /* 0 for regular, EMTRY_DIR_SIZE for dirs, strlen (symname) for symlinks) */ loff_t i_size, struct dentry *dentry, diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 4e153051bc7..950e3d1b5c9 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -55,7 +55,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) break; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) break; @@ -96,7 +96,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); break; } case REISERFS_IOC_GETVERSION: @@ -107,7 +107,7 @@ setflags_out: err = -EPERM; break; } - err = mnt_want_write(filp->f_path.mnt); + err = mnt_want_write_file(filp); if (err) break; if (get_user(inode->i_generation, (int __user *)arg)) { @@ -117,7 +117,7 @@ setflags_out: inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setversion_out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); break; default: err = -ENOTTY; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index eb711060a6f..c3cf54fd4de 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, char b[BDEVNAME_SIZE]; int ret; - /* - * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS - * dependency inversion warnings. - */ - reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); - reiserfs_write_lock(sb); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); @@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb)); - reiserfs_write_lock(sb); - if (ret) + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); @@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * We need to unlock here to avoid creating the following - * dependency: - * reiserfs_lock -> sysfs_mutex - * Because the reiserfs mmap path creates the following dependency: - * mm->mmap -> reiserfs_lock, hence we have - * mm->mmap -> reiserfs_lock ->sysfs_mutex - * This would ends up in a circular dependency with sysfs readdir path - * which does sysfs_mutex -> mm->mmap_sem - * This is fine because the reiserfs lock is useless in mount path, - * at least until we call journal_begin. We keep it for paranoid - * reasons. - */ - reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); @@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); - reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; @@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name, init_journal_hash(sb); jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(sb) < 0) { + + /* + * Journal_read needs to be inspected in order to push down + * the lock further inside (or even remove it). + */ + reiserfs_write_lock(sb); + ret = journal_read(sb); + reiserfs_write_unlock(sb); + if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) { - reiserfs_write_unlock(sb); + if (reiserfs_mounted_fs_count <= 1) commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - reiserfs_write_lock(sb); - } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; @@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } - /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } -/* this must be called inside a transaction, and requires the -** kernel_lock to be held +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { @@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { @@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 80058e8ce36..14637886523 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -559,7 +559,7 @@ static int drop_new_inode(struct inode *inode) ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. */ -static int new_inode_init(struct inode *inode, struct inode *dir, int mode) +static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) { /* Make inode invalid - just in case we are going to drop it before * the initialization happens */ @@ -572,7 +572,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) return 0; } -static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, +static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { int retval; @@ -643,7 +643,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, return retval; } -static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; @@ -721,7 +721,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, return retval; } -static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval; struct inode *inode; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 14363b96b6a..e12d8b97cd4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,6 +28,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> +#include <linux/seq_file.h> struct file_system_type reiserfs_fs_type; @@ -61,6 +62,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) static int reiserfs_remount(struct super_block *s, int *flags, char *data); static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); +void show_alloc_options(struct seq_file *seq, struct super_block *s); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -453,16 +455,20 @@ int remove_save_link(struct inode *inode, int truncate) static void reiserfs_kill_sb(struct super_block *s) { if (REISERFS_SB(s)) { - if (REISERFS_SB(s)->xattr_root) { - d_invalidate(REISERFS_SB(s)->xattr_root); - dput(REISERFS_SB(s)->xattr_root); - REISERFS_SB(s)->xattr_root = NULL; - } - if (REISERFS_SB(s)->priv_root) { - d_invalidate(REISERFS_SB(s)->priv_root); - dput(REISERFS_SB(s)->priv_root); - REISERFS_SB(s)->priv_root = NULL; - } + /* + * Force any pending inode evictions to occur now. Any + * inodes to be removed that have extended attributes + * associated with them need to clean them up before + * we can release the extended attribute root dentries. + * shrink_dcache_for_umount will BUG if we don't release + * those before it's called so ->put_super is too late. + */ + shrink_dcache_sb(s); + + dput(REISERFS_SB(s)->xattr_root); + REISERFS_SB(s)->xattr_root = NULL; + dput(REISERFS_SB(s)->priv_root); + REISERFS_SB(s)->priv_root = NULL; } kill_block_super(s); @@ -532,7 +538,6 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) static void reiserfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } @@ -597,6 +602,82 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *s = root->d_sb; + struct reiserfs_journal *journal = SB_JOURNAL(s); + long opts = REISERFS_SB(s)->s_mount_opt; + + if (opts & (1 << REISERFS_LARGETAIL)) + seq_puts(seq, ",tails=on"); + else if (!(opts & (1 << REISERFS_SMALLTAIL))) + seq_puts(seq, ",notail"); + /* tails=small is default so we don't show it */ + + if (!(opts & (1 << REISERFS_BARRIER_FLUSH))) + seq_puts(seq, ",barrier=none"); + /* barrier=flush is default so we don't show it */ + + if (opts & (1 << REISERFS_ERROR_CONTINUE)) + seq_puts(seq, ",errors=continue"); + else if (opts & (1 << REISERFS_ERROR_PANIC)) + seq_puts(seq, ",errors=panic"); + /* errors=ro is default so we don't show it */ + + if (opts & (1 << REISERFS_DATA_LOG)) + seq_puts(seq, ",data=journal"); + else if (opts & (1 << REISERFS_DATA_WRITEBACK)) + seq_puts(seq, ",data=writeback"); + /* data=ordered is default so we don't show it */ + + if (opts & (1 << REISERFS_ATTRS)) + seq_puts(seq, ",attrs"); + + if (opts & (1 << REISERFS_XATTRS_USER)) + seq_puts(seq, ",user_xattr"); + + if (opts & (1 << REISERFS_EXPOSE_PRIVROOT)) + seq_puts(seq, ",expose_privroot"); + + if (opts & (1 << REISERFS_POSIXACL)) + seq_puts(seq, ",acl"); + + if (REISERFS_SB(s)->s_jdev) + seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + + if (journal->j_max_commit_age != journal->j_default_max_commit_age) + seq_printf(seq, ",commit=%d", journal->j_max_commit_age); + +#ifdef CONFIG_QUOTA + if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + else if (opts & (1 << REISERFS_USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + else if (opts & (1 << REISERFS_GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (REISERFS_SB(s)->s_jquota_fmt) { + if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD) + seq_puts(seq, ",jqfmt=vfsold"); + else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0) + seq_puts(seq, ",jqfmt=vfsv0"); + } +#endif + + /* Block allocator options */ + if (opts & (1 << REISERFS_NO_BORDER)) + seq_puts(seq, ",block-allocator=noborder"); + if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=no_unhashed_relocation"); + if (opts & (1 << REISERFS_HASHED_RELOCATION)) + seq_puts(seq, ",block-allocator=hashed_relocation"); + if (opts & (1 << REISERFS_TEST4)) + seq_puts(seq, ",block-allocator=test4"); + show_alloc_options(seq, s); + return 0; +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -617,7 +698,7 @@ static const struct super_operations reiserfs_sops = { .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, - .show_options = generic_show_options, + .show_options = reiserfs_show_options, #ifdef CONFIG_QUOTA .quota_read = reiserfs_quota_read, .quota_write = reiserfs_quota_write, @@ -915,9 +996,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"jdev",.arg_required = 'j',.values = NULL}, {"nolargeio",.arg_required = 'w',.values = NULL}, {"commit",.arg_required = 'c',.values = NULL}, - {"usrquota",.setmask = 1 << REISERFS_QUOTA}, - {"grpquota",.setmask = 1 << REISERFS_QUOTA}, - {"noquota",.clrmask = 1 << REISERFS_QUOTA}, + {"usrquota",.setmask = 1 << REISERFS_USRQUOTA}, + {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA}, + {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA}, {"errors",.arg_required = 'e',.values = error_actions}, {"usrjquota",.arg_required = 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, @@ -1031,12 +1112,19 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin return 0; } strcpy(qf_names[qtype], arg); - *mount_options |= 1 << REISERFS_QUOTA; + if (qtype == USRQUOTA) + *mount_options |= 1 << REISERFS_USRQUOTA; + else + *mount_options |= 1 << REISERFS_GRPQUOTA; } else { if (qf_names[qtype] != REISERFS_SB(s)->s_qf_names[qtype]) kfree(qf_names[qtype]); qf_names[qtype] = NULL; + if (qtype == USRQUOTA) + *mount_options &= ~(1 << REISERFS_USRQUOTA); + else + *mount_options &= ~(1 << REISERFS_GRPQUOTA); } } if (c == 'f') { @@ -1075,9 +1163,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "journaled quota format not specified."); return 0; } - /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ - if (!(*mount_options & (1 << REISERFS_QUOTA)) - && sb_any_quota_loaded(s)) { + if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) && + sb_has_quota_loaded(s, USRQUOTA)) || + (!(*mount_options & (1 << REISERFS_GRPQUOTA)) && + sb_has_quota_loaded(s, GRPQUOTA))) { reiserfs_warning(s, "super-6516", "quota options must " "be present when quota is turned on."); return 0; @@ -1164,7 +1253,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names, kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; } - REISERFS_SB(s)->s_jquota_fmt = *qfmt; + if (*qfmt) + REISERFS_SB(s)->s_jquota_fmt = *qfmt; } #endif @@ -1225,7 +1315,8 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) safe_mask |= 1 << REISERFS_ERROR_RO; safe_mask |= 1 << REISERFS_ERROR_CONTINUE; safe_mask |= 1 << REISERFS_ERROR_PANIC; - safe_mask |= 1 << REISERFS_QUOTA; + safe_mask |= 1 << REISERFS_USRQUOTA; + safe_mask |= 1 << REISERFS_GRPQUOTA; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -1428,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); - reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); - reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1655,22 +1744,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&REISERFS_SB(s)->lock); REISERFS_SB(s)->lock_depth = -1; - /* - * This function is called with the bkl, which also was the old - * locking used here. - * do_journal_begin() will soon check if we hold the lock (ie: was the - * bkl). This is likely because do_journal_begin() has several another - * callers because at this time, it doesn't seem to be necessary to - * protect against anything. - * Anyway, let's be conservative and lock for now. - */ - reiserfs_write_lock(s); - jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { - goto error; + goto error_unlocked; + } + if (jdev_name && jdev_name[0]) { + REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); + if (!REISERFS_SB(s)->s_jdev) { + SWARN(silent, s, "", "Cannot allocate memory for " + "journal device name"); + goto error; + } } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1678,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (blocks) { SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error; + goto error_unlocked; } /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ @@ -1688,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); - goto error; + goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); @@ -1704,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "or increase size of your LVM partition"); SWARN(silent, s, "", "Or may be you forgot to " "reboot after fdisk when it told you to"); - goto error; + goto error_unlocked; } sbi->s_mount_state = SB_REISERFS_STATE(s); @@ -1712,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if ((errval = reiserfs_init_bitmap_cache(s))) { SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error; + goto error_unlocked; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); @@ -1736,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (reiserfs_barrier_flush(s)) { printk("reiserfs: using flush barriers\n"); } + // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", "unable to initialize journal space"); - goto error; + goto error_unlocked; } else { jinit_done = 1; /* once this is set, journal_release must be called ** if we error out of the mount */ } + if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); - goto error; + goto error_unlocked; } if (replay_only(s)) - goto error; + goto error_unlocked; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { SWARN(silent, s, "clm-7000", @@ -1767,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error; + goto error_unlocked; } + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + if (root_inode->i_state & I_NEW) { reiserfs_read_locked_inode(root_inode, &args); unlock_new_inode(root_inode); @@ -1896,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - if (jinit_done) { /* kill the commit thread, free journal ram */ + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); journal_release_error(NULL, s); + reiserfs_write_unlock(s); } - reiserfs_write_unlock(s); - reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); @@ -2054,12 +2157,13 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, int err; struct inode *inode; struct reiserfs_transaction_handle th; + int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) return -EINVAL; /* Quotafile not on the same filesystem? */ - if (path->mnt->mnt_sb != sb) { + if (path->dentry->d_sb != sb) { err = -EXDEV; goto out; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6bc346c160e..c24deda8a8b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,7 +66,7 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) } #endif -static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); return dir->i_op->mkdir(dir, dentry, mode); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index eed99428f10..e1a7779dd3c 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file, struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; unsigned long isize, offset, maxpages, lpages; + int ret; if (!mtd) - goto cant_map_directly; + return (unsigned long) -ENOSYS; /* the mapping mustn't extend beyond the EOF */ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file, if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; - /* we need to call down to the MTD layer to do the actual mapping */ - if (mtd->get_unmapped_area) { - if (addr != 0) - return (unsigned long) -EINVAL; - - if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) - return (unsigned long) -EINVAL; + if (addr != 0) + return (unsigned long) -EINVAL; - offset += ROMFS_I(inode)->i_dataoffset; - if (offset > mtd->size - len) - return (unsigned long) -EINVAL; + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; - return mtd->get_unmapped_area(mtd, len, offset, flags); - } + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; -cant_map_directly: - return (unsigned long) -ENOSYS; + ret = mtd_get_unmapped_area(mtd, len, offset, flags); + if (ret == -EOPNOTSUPP) + ret = -ENOSYS; + return (unsigned long) ret; } /* diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 8b4089f3040..bb36ab74eb4 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -403,7 +403,6 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) static void romfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } diff --git a/fs/select.c b/fs/select.c index d33418fdc85..e782258d0de 100644 --- a/fs/select.c +++ b/fs/select.c @@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block) } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, - long, timeout_msecs) + int, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; diff --git a/fs/seq_file.c b/fs/seq_file.c index dba43c3ea3a..4023d6be939 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -397,7 +397,7 @@ EXPORT_SYMBOL(seq_printf); * Returns pointer past last written character in @s, or NULL in case of * failure. */ -char *mangle_path(char *s, char *p, char *esc) +char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; @@ -427,7 +427,7 @@ EXPORT_SYMBOL(mangle_path); * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ -int seq_path(struct seq_file *m, struct path *path, char *esc) +int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); @@ -450,8 +450,8 @@ EXPORT_SYMBOL(seq_path); /* * Same as seq_path, but relative to supplied root. */ -int seq_path_root(struct seq_file *m, struct path *path, struct path *root, - char *esc) +int seq_path_root(struct seq_file *m, const struct path *path, + const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); @@ -480,7 +480,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, /* * returns the path of the 'dentry' from the root of its filesystem. */ -int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451d..7ae2a574cb2 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,21 @@ #include <linux/signalfd.h> #include <linux/syscalls.h> +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + /* + * The lockless check can race with remove_wait_queue() in progress, + * but in this case its caller should run under rcu_read_lock() and + * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + */ + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/fs/splice.c b/fs/splice.c index fa2defa8afc..1ec0493266b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -25,7 +25,6 @@ #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/buffer_head.h> #include <linux/module.h> #include <linux/syscalls.h> #include <linux/uio.h> diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f744be98cd5..af0b7380259 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, spin_lock(&cache->lock); while (1) { - for (i = 0; i < cache->entries; i++) - if (cache->entry[i].block == block) + for (i = cache->curr_blk, n = 0; n < cache->entries; n++) { + if (cache->entry[i].block == block) { + cache->curr_blk = i; break; + } + i = (i + 1) % cache->entries; + } - if (i == cache->entries) { + if (n == cache->entries) { /* * Block not in cache, if all cache entries are used * go to sleep waiting for one to become available. @@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } + cache->curr_blk = 0; cache->next_blk = 0; cache->unused = entries; cache->entries = entries; @@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, u64 *block, int *offset, int length) { struct squashfs_sb_info *msblk = sb->s_fs_info; - int bytes, copied = length; + int bytes, res = length; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); while (length) { entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); - if (entry->error) - return entry->error; - else if (*offset >= entry->length) - return -EIO; + if (entry->error) { + res = entry->error; + goto error; + } else if (*offset >= entry->length) { + res = -EIO; + goto error; + } bytes = squashfs_copy_data(buffer, entry, *offset, length); if (buffer) @@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer, squashfs_cache_put(entry); } - return copied; + return res; + +error: + squashfs_cache_put(entry); + return res; } diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index fd7b3b3bda1..81afbccfa84 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; - inode->i_blocks = ((inode->i_size - - le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + inode->i_blocks = (inode->i_size - + le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 651f0b31d29..52934a22f29 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -28,6 +28,7 @@ struct squashfs_cache { char *name; int entries; + int curr_blk; int next_blk; int num_waiters; int unused; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 2da1715452a..ecaa2f7bdb8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -290,7 +290,7 @@ handle_fragments: check_directory_table: /* Sanity check directory_table */ - if (msblk->directory_table >= next_table) { + if (msblk->directory_table > next_table) { err = -EINVAL; goto failed_mount; } @@ -464,7 +464,6 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) static void squashfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } diff --git a/fs/statfs.c b/fs/statfs.c index 9cf04a11896..2aa6a22e0be 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -7,6 +7,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/uaccess.h> +#include "internal.h" static int flags_by_mnt(int mnt_flags) { @@ -45,7 +46,7 @@ static int calculate_f_flags(struct vfsmount *mnt) flags_by_sb(mnt->mnt_sb->s_flags); } -int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) +static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) { int retval; @@ -205,19 +206,23 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; + struct super_block *s = user_get_super(dev); int err; - - s = user_get_super(new_decode_dev(dev)); if (!s) return -EINVAL; - err = statfs_by_dentry(s->s_root, &sbuf); + err = statfs_by_dentry(s->s_root, sbuf); drop_super(s); + return err; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); if (err) return err; diff --git a/fs/super.c b/fs/super.c index afd0f1ad45e..6277ec6cb60 100644 --- a/fs/super.c +++ b/fs/super.c @@ -136,12 +136,13 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_files); #endif s->s_bdi = &default_backing_dev_info; - INIT_LIST_HEAD(&s->s_instances); + INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); INIT_LIST_HEAD(&s->s_inode_lru); spin_lock_init(&s->s_inode_lru_lock); + INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -200,6 +201,7 @@ static inline void destroy_super(struct super_block *s) free_percpu(s->s_files); #endif security_sb_free(s); + WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); kfree(s); @@ -210,7 +212,7 @@ static inline void destroy_super(struct super_block *s) /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -void __put_super(struct super_block *sb) +static void __put_super(struct super_block *sb) { if (!--sb->s_count) { list_del_init(&sb->s_list); @@ -225,7 +227,7 @@ void __put_super(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -void put_super(struct super_block *sb) +static void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -328,7 +330,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) bool grab_super_passive(struct super_block *sb) { spin_lock(&sb_lock); - if (list_empty(&sb->s_instances)) { + if (hlist_unhashed(&sb->s_instances)) { spin_unlock(&sb_lock); return false; } @@ -337,7 +339,7 @@ bool grab_super_passive(struct super_block *sb) spin_unlock(&sb_lock); if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return true; up_read(&sb->s_umount); } @@ -400,7 +402,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_instances); + hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -420,13 +422,14 @@ struct super_block *sget(struct file_system_type *type, void *data) { struct super_block *s = NULL; + struct hlist_node *node; struct super_block *old; int err; retry: spin_lock(&sb_lock); if (test) { - list_for_each_entry(old, &type->fs_supers, s_instances) { + hlist_for_each_entry(old, node, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) @@ -462,7 +465,7 @@ retry: s->s_type = type; strlcpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); - list_add(&s->s_instances, &type->fs_supers); + hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); register_shrinker(&s->s_shrink); @@ -497,14 +500,14 @@ void sync_supers(void) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt) + if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN)) sb->s_op->write_super(sb); up_read(&sb->s_umount); @@ -533,13 +536,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -566,14 +569,15 @@ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; + struct hlist_node *node; spin_lock(&sb_lock); - list_for_each_entry(sb, &type->fs_supers, s_instances) { + hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -607,14 +611,14 @@ struct super_block *get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -630,6 +634,28 @@ rescan: EXPORT_SYMBOL(get_super); /** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + while (1) { + struct super_block *s = get_super(bdev); + if (!s || s->s_frozen == SB_UNFROZEN) + return s; + up_read(&s->s_umount); + vfs_check_frozen(s, SB_FREEZE_WRITE); + put_super(s); + } +} +EXPORT_SYMBOL(get_super_thawed); + +/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * @@ -647,7 +673,7 @@ struct super_block *get_active_super(struct block_device *bdev) restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { if (grab_super(sb)) /* drops sb_lock */ @@ -667,14 +693,14 @@ struct super_block *user_get_super(dev_t dev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -719,23 +745,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { - if (force) + if (force) { mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) - return -EBUSY; + } else { + retval = sb_prepare_remount_readonly(sb); + if (retval) + return retval; + } } if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { if (!force) - return retval; + goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + /* Needs to be ordered wrt mnt_is_readonly() */ + smp_wmb(); + sb->s_readonly_remount = 0; /* * Some filesystems modify their metadata via some other path than the @@ -748,6 +780,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; + +cancel_readonly: + sb->s_readonly_remount = 0; + return retval; } static void do_emergency_remount(struct work_struct *work) @@ -756,12 +792,13 @@ static void do_emergency_remount(struct work_struct *work) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && + !(sb->s_flags & MS_RDONLY)) { /* * What lock protects sb->s_flags?? */ @@ -1144,6 +1181,11 @@ int freeze_super(struct super_block *sb) return -EBUSY; } + if (!(sb->s_flags & MS_BORN)) { + up_write(&sb->s_umount); + return 0; /* sic - it's "nothing to do" */ + } + if (sb->s_flags & MS_RDONLY) { sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); @@ -1166,6 +1208,8 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); deactivate_locked_super(sb); return ret; } diff --git a/fs/sync.c b/fs/sync.c index 101b8ef901d..f3501ef3923 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -14,7 +14,6 @@ #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/quotaops.h> -#include <linux/buffer_head.h> #include <linux/backing-dev.h> #include "internal.h" diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d4e6080b4b2..00012e31829 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, const void *ns = NULL; int err; + if (!dir_sd) { + WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n", + kobject_name(kobj)); + return -ENOENT; + } + err = 0; if (!sysfs_ns_type(dir_sd)) goto out; @@ -518,7 +524,7 @@ out: } int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, mode_t amode) + const struct attribute *attr, int type, umode_t amode) { umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; @@ -618,7 +624,7 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); * */ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, - mode_t mode) + umode_t mode) { struct sysfs_dirent *sd; struct iattr newattrs; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 194414f8298..dd1701caecc 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -33,7 +33,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, int error = 0, i; for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { - mode_t mode = 0; + umode_t mode = 0; /* in update mode, we're changing the permissions or * visibility. Do this by first removing then diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index c81b22f3ace..85eb81683a2 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -187,7 +187,7 @@ out: return error; } -static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; - if (!dir_sd) + if (!dir_sd) { + WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", + name); return -ENOENT; + } sysfs_addrm_start(&acxt, dir_sd); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce29e28b766..7484a36ee67 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -79,7 +79,7 @@ struct sysfs_dirent { }; unsigned int s_flags; - unsigned short s_mode; + umode_t s_mode; ino_t s_ino; struct sysfs_inode_attrs *s_iattr; }; @@ -229,7 +229,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, mode_t amode); + const struct attribute *attr, int type, umode_t amode); /* * bin.c */ diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 0c96c98bd1d..8233b02ecca 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -132,7 +132,7 @@ void sysv_free_inode(struct inode * inode) brelse(bh); } -struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) +struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 25ffb3e9a3f..3da5ce25faf 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -336,7 +336,6 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) static void sysv_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index fa8d43c92bb..90b54b43878 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -442,7 +442,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct super_block *s = mnt->mnt_sb; + struct super_block *s = dentry->d_sb; generic_fillattr(dentry->d_inode, stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index e474fbcf8bd..b217797e621 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -61,7 +61,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st return NULL; } -static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_t rdev) +static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -80,7 +80,7 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_ return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) +static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { return sysv_mknod(dir, dentry, mode, 0); } @@ -131,7 +131,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode) +static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index bb55cdb394b..0e4b821c569 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -125,7 +125,7 @@ static inline void dirty_sb(struct super_block *sb) /* ialloc.c */ extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, struct buffer_head **); -extern struct inode * sysv_new_inode(const struct inode *, mode_t); +extern struct inode * sysv_new_inode(const struct inode *, umode_t); extern void sysv_free_inode(struct inode *); extern unsigned long sysv_count_free_inodes(struct super_block *); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index b09ba2dd8b6..f922cbacdb9 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -38,9 +38,6 @@ DEFINE_SPINLOCK(dbg_lock); -static char dbg_key_buf0[128]; -static char dbg_key_buf1[128]; - static const char *get_key_fmt(int fmt) { switch (fmt) { @@ -103,8 +100,8 @@ static const char *get_dent_type(int type) } } -static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, - char *buffer) +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len) { char *p = buffer; int type = key_type(c, key); @@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { switch (type) { case UBIFS_INO_KEY: - sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), - get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; case UBIFS_DENT_KEY: case UBIFS_XENT_KEY: - sprintf(p, "(%lu, %s, %#08x)", - (unsigned long)key_inum(c, key), - get_key_type(type), key_hash(c, key)); + len -= snprintf(p, len, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); break; case UBIFS_DATA_KEY: - sprintf(p, "(%lu, %s, %u)", - (unsigned long)key_inum(c, key), - get_key_type(type), key_block(c, key)); + len -= snprintf(p, len, "(%lu, %s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); break; case UBIFS_TRUN_KEY: - sprintf(p, "(%lu, %s)", - (unsigned long)key_inum(c, key), - get_key_type(type)); + len -= snprintf(p, len, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); break; default: - sprintf(p, "(bad key type: %#08x, %#08x)", - key->u32[0], key->u32[1]); + len -= snprintf(p, len, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); } } else - sprintf(p, "bad key format %d", c->key_fmt); -} - -const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf0); - return dbg_key_buf0; -} - -const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) -{ - /* dbg_lock must be held */ - sprintf_key(c, key, dbg_key_buf1); - return dbg_key_buf1; + len -= snprintf(p, len, "bad key format %d", c->key_fmt); + ubifs_assert(len > 0); + return p; } const char *dbg_ntype(int type) @@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int i, n; union ubifs_key key; const struct ubifs_ch *ch = node; + char key_buf[DBG_KEY_BUF_LEN]; if (dbg_is_tst_rcvry(c)) return; @@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) const struct ubifs_ino_node *ino = node; key_read(c, &ino->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tcreat_sqnum %llu\n", (unsigned long long)le64_to_cpu(ino->creat_sqnum)); printk(KERN_DEBUG "\tsize %llu\n", @@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int nlen = le16_to_cpu(dent->nlen); key_read(c, &dent->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tinum %llu\n", (unsigned long long)le64_to_cpu(dent->inum)); printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); @@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; key_read(c, &dn->key, &key); - printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tkey %s\n", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); printk(KERN_DEBUG "\tsize %u\n", le32_to_cpu(dn->size)); printk(KERN_DEBUG "\tcompr_typ %d\n", @@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) key_read(c, &br->key, &key); printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), - le32_to_cpu(br->len), DBGKEY(&key)); + le32_to_cpu(br->len), + dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); } break; } @@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c, { int n; const struct ubifs_zbranch *zbr; + char key_buf[DBG_KEY_BUF_LEN]; spin_lock(&dbg_lock); if (znode->parent) @@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c, printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + dbg_snprintf_key(c, &zbr->key, + key_buf, + DBG_KEY_BUF_LEN)); else printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " "%s\n", n, zbr->znode, zbr->lnum, zbr->offs, zbr->len, - DBGKEY(&zbr->key)); + dbg_snprintf_key(c, &zbr->key, + key_buf, + DBG_KEY_BUF_LEN)); } spin_unlock(&dbg_lock); } @@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, int err, nlen1, nlen2, cmp; struct ubifs_dent_node *dent1, *dent2; union ubifs_key key; + char key_buf[DBG_KEY_BUF_LEN]; ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); @@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr1->key)); + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); dbg_dump_node(c, dent1); goto out_free; } @@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, DBGKEY(&key)); + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); dbg_err("but it should have key %s according to tnc", - DBGKEY(&zbr2->key)); + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); dbg_dump_node(c, dent2); goto out_free; } @@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_err("2 xent/dent nodes with the same name"); else dbg_err("bad order of colliding key %s", - DBGKEY(&key)); + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); dbg_dump_node(c, dent1); diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 8d9c4681018..ad1a6fee601 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -169,40 +169,39 @@ struct ubifs_global_debug_info { spin_unlock(&dbg_lock); \ } while (0) -const char *dbg_key_str0(const struct ubifs_info *c, - const union ubifs_key *key); -const char *dbg_key_str1(const struct ubifs_info *c, - const union ubifs_key *key); - -/* - * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message - * macros. - */ -#define DBGKEY(key) dbg_key_str0(c, (key)) -#define DBGKEY1(key) dbg_key_str1(c, (key)) - -extern spinlock_t dbg_lock; - -#define ubifs_dbg_msg(type, fmt, ...) do { \ - spin_lock(&dbg_lock); \ - pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ +#define ubifs_dbg_msg(type, fmt, ...) \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) + +#define DBG_KEY_BUF_LEN 32 +#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \ + char __tmp_key_buf[DBG_KEY_BUF_LEN]; \ + pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \ + dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \ } while (0) /* Just a debugging messages not related to any specific UBIFS subsystem */ -#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) + /* General messages */ #define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ #define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) +#define dbg_jnlk(key, fmt, ...) \ + ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__) /* Additional TNC messages */ #define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) +#define dbg_tnck(key, fmt, ...) \ + ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__) /* Additional lprops messages */ #define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) /* Additional LEB find messages */ #define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) /* Additional mount messages */ #define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) +#define dbg_mntk(key, fmt, ...) \ + ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__) /* Additional I/O messages */ #define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) /* Additional commit messages */ @@ -218,6 +217,7 @@ extern spinlock_t dbg_lock; /* Additional recovery messages */ #define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) +extern spinlock_t dbg_lock; extern struct ubifs_global_debug_info ubifs_dbg; static inline int dbg_is_chk_gen(const struct ubifs_info *c) @@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state); const char *dbg_jhead(int jhead); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); +const char *dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, int len); void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); void dbg_dump_node(const struct ubifs_info *c, const void *node); void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, @@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define dbg_dump_stack() #define ubifs_assert_cmt_locked(c) -#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } @@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; } static inline const char * dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key) { return ""; } +static inline const char * +dbg_snprintf_key(const struct ubifs_info *c, + const union ubifs_key *key, char *buffer, + int len) { return ""; } static inline void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) { return; } static inline void dbg_dump_node(const struct ubifs_info *c, diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 68349204331..d6fe1c79f18 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -56,7 +56,7 @@ * * This function returns the inherited flags. */ -static int inherit_flags(const struct inode *dir, int mode) +static int inherit_flags(const struct inode *dir, umode_t mode) { int flags; const struct ubifs_inode *ui = ubifs_inode(dir); @@ -86,7 +86,7 @@ static int inherit_flags(const struct inode *dir, int mode) * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode) + umode_t mode) { struct inode *inode; struct ubifs_inode *ui; @@ -253,7 +253,7 @@ out: return ERR_PTR(err); } -static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, +static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -268,7 +268,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode, * parent directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -712,7 +712,7 @@ out_cancel: return err; } -static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -725,7 +725,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) * directory inode. */ - dbg_gen("dent '%.*s', mode %#x in dir ino %lu", + dbg_gen("dent '%.*s', mode %#hx in dir ino %lu", dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -769,7 +769,7 @@ out_budg: } static int ubifs_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { struct inode *inode; struct ubifs_inode *ui; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 548acf494af..1a7e2d8bdbe 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -173,12 +173,12 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * Make sure the file-system is read-write and make sure it * will not become read-only while we are changing the flags. */ - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) return err; dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); err = setflags(inode, flags); - mnt_drop_write(file->f_path.mnt); + mnt_drop_write_file(file); return err; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index cef0460f4c5..2f438ab2e7a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); - dbg_jnl("ino %lu, blk %u, len %d, key %s", - (unsigned long)key_inum(c, key), key_block(c, key), len, - DBGKEY(key)); + dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", + (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); @@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, dn = (void *)trun + UBIFS_TRUN_NODE_SZ; blk = new_size >> UBIFS_BLOCK_SHIFT; data_key_init(c, &key, inum, blk); - dbg_jnl("last block key %s", DBGKEY(&key)); + dbg_jnlk(&key, "last block key "); err = ubifs_tnc_lookup(c, &key, dn); if (err == -ENOENT) dlen = 0; /* Not found (so it is a hole) */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 6189c74d97f..66d59d0a140 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1986,12 +1986,11 @@ again: if (path[h].in_tree) continue; - nnode = kmalloc(sz, GFP_NOFS); + nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS); if (!nnode) { err = -ENOMEM; goto out; } - memcpy(nnode, &path[h].nnode, sz); parent = nnode->parent; parent->nbranch[nnode->iip].nnode = nnode; path[h].ptr.nnode = nnode; @@ -2004,12 +2003,11 @@ again: const size_t sz = sizeof(struct ubifs_pnode); struct ubifs_nnode *parent; - pnode = kmalloc(sz, GFP_NOFS); + pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS); if (!pnode) { err = -ENOMEM; goto out; } - memcpy(pnode, &path[h].pnode, sz); parent = pnode->parent; parent->nbranch[pnode->iip].pnode = pnode; path[h].ptr.pnode = pnode; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ccabaf1164b..b007637f040 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { int err; - dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, - r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); + dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ", + r->lnum, r->offs, r->len, r->deletion, r->sqnum); /* Set c->replay_sqnum to help deal with dangling branches. */ c->replay_sqnum = r->sqnum; @@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, { struct replay_entry *r; - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); @@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, struct replay_entry *r; char *nbuf; - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ae0e76bb6eb..63765d58445 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -276,7 +276,6 @@ static void ubifs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct ubifs_inode *ui = ubifs_inode(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ubifs_inode_slab, ui); } @@ -420,9 +419,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +static int ubifs_show_options(struct seq_file *s, struct dentry *root) { - struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_printf(s, ",fast_unmount"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 06673864768..16ad84d8402 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, return err; } - lnc_node = kmalloc(zbr->len, GFP_NOFS); + lnc_node = kmemdup(node, zbr->len, GFP_NOFS); if (!lnc_node) /* We don't have to have the cache, so no error */ return 0; - memcpy(lnc_node, node, zbr->len); zbr->leaf = lnc_node; return 0; } @@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, { int ret; - dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs); ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, zbr->offs); @@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, ret = 0; } if (ret == 0 && c->replaying) - dbg_mnt("dangling branch LEB %d:%d len %d, key %s", - zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ", + zbr->lnum, zbr->offs, zbr->len); return ret; } @@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c, if (adding || !o_znode) return 0; - dbg_mnt("dangling match LEB %d:%d len %d %s", + dbg_mntk(key, "dangling match LEB %d:%d len %d key ", o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, - o_znode->zbranch[o_n].len, DBGKEY(key)); + o_znode->zbranch[o_n].len); *zn = o_znode; *n = o_n; return 1; @@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search key %s", DBGKEY(key)); + dbg_tnck(key, "search key "); ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); znode = c->zroot.znode; @@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; unsigned long time = get_seconds(); - dbg_tnc("search and dirty key %s", DBGKEY(key)); + dbg_tnck(key, "search and dirty key "); znode = c->zroot.znode; if (unlikely(!znode)) { @@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf, if (!keys_eq(c, &zbr->key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_tnc("looked for key %s found node's key %s", - DBGKEY(&zbr->key), DBGKEY1(&key1)); + dbg_tnck(&zbr->key, "looked for key "); + dbg_tnck(&key1, "found node's key "); goto out_err; } @@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) ubifs_err("failed to read from LEB %d:%d, error %d", lnum, offs, err); dbg_dump_stack(); - dbg_tnc("key %s", DBGKEY(&bu->key)); + dbg_tnck(&bu->key, "key "); return err; } @@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, int found, n, err; struct ubifs_znode *znode; - dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1986,8 +1985,7 @@ again: zp = znode->parent; if (znode->child_cnt < c->fanout) { ubifs_assert(n != c->fanout); - dbg_tnc("inserted at %d level %d, key %s", n, znode->level, - DBGKEY(key)); + dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level); insert_zbranch(znode, zbr, n); @@ -2002,7 +2000,7 @@ again: * Unfortunately, @znode does not have more empty slots and we have to * split it. */ - dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + dbg_tnck(key, "splitting level %d, key ", znode->level); if (znode->alt) /* @@ -2096,7 +2094,7 @@ do_split: } /* Insert new key and branch */ - dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level); insert_zbranch(zi, zbr, n); @@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (!found) { struct ubifs_zbranch zbr; @@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, - old_offs, lnum, offs, len, DBGKEY(key)); + dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum, + old_offs, lnum, offs, len); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, - DBGKEY(key)); + dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) /* Delete without merge for now */ ubifs_assert(znode->level == 0); ubifs_assert(n >= 0 && n < c->fanout); - dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + dbg_tnck(&znode->zbranch[n].key, "deleting key "); zbr = &znode->zbranch[n]; lnc_free(zbr); @@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, dbg_dump_znode(c, znode); goto out_unlock; } - dbg_tnc("removing %s", DBGKEY(key)); + dbg_tnck(key, "removing key "); } if (k) { for (i = n + 1 + k; i < znode->child_cnt; i++) @@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, out_dump: block = key_block(c, key); - ubifs_err("inode %lu has size %lld, but there are data at offset %lld " - "(data key %s)", (unsigned long)inode->i_ino, size, - ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); + ubifs_err("inode %lu has size %lld, but there are data at offset %lld", + (unsigned long)inode->i_ino, size, + ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); dbg_dump_inode(c, inode); dbg_dump_stack(); diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index b48db999903..dc28fe6ec07 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, case UBIFS_XENT_KEY: break; default: - dbg_msg("bad key type at slot %d: %s", i, - DBGKEY(&zbr->key)); + dbg_msg("bad key type at slot %d: %d", + i, key_type(c, &zbr->key)); err = 3; goto out_dump; } @@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->offs); if (err) { - dbg_tnc("key %s", DBGKEY(key)); + dbg_tnck(key, "key "); return err; } @@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, if (!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_tnc("looked for key %s found node's key %s", - DBGKEY(key), DBGKEY1(&key1)); + dbg_tnck(key, "looked for key "); + dbg_tnck(&key1, "but found node's key "); dbg_dump_node(c, node); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 27f22551f80..12e94774aa8 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1734,7 +1734,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, - int mode); + umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index bf18f7a0454..85b27226875 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui = ubifs_inode(inode); ui->xattr = 1; ui->flags |= UBIFS_XATTR_FL; - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; @@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, return err; kfree(ui->data); - ui->data = kmalloc(size, GFP_NOFS); + ui->data = kmemdup(value, size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; diff --git a/fs/udf/file.c b/fs/udf/file.c index d8ffa7cc661..d567b8448df 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, err = udf_expand_file_adinicb(inode); if (err) { udf_debug("udf_expand_adinicb: err=%d\n", err); - up_write(&iinfo->i_data_sem); return err; } } else { @@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, iinfo->i_lenAlloc = pos + count; else iinfo->i_lenAlloc = inode->i_size; + up_write(&iinfo->i_data_sem); } - } - up_write(&iinfo->i_data_sem); + } else + up_write(&iinfo->i_data_sem); retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); if (retval > 0) @@ -201,12 +201,10 @@ out: static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { - mutex_lock(&inode->i_mutex); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6fb7e0adcda..05ab48195be 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -46,7 +46,7 @@ void udf_free_inode(struct inode *inode) udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } -struct inode *udf_new_inode(struct inode *dir, int mode, int *err) +struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4fd1d809738..7699df7b319 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -48,13 +48,12 @@ MODULE_LICENSE("GPL"); #define EXTENT_MERGE_SIZE 5 -static mode_t udf_convert_permissions(struct fileEntry *); +static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); -static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, - sector_t *, int *); +static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, @@ -151,6 +150,12 @@ const struct address_space_operations udf_aops = { .bmap = udf_bmap, }; +/* + * Expand file stored in ICB to a normal one-block-file + * + * This function requires i_data_sem for writing and releases it. + * This function requires i_mutex held + */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; @@ -169,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } + /* + * Release i_data_sem so that we can lock a page - page lock ranks + * above i_data_sem. i_mutex still protects us against file changes. + */ + up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) @@ -187,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode) SetPageUptodate(page); kunmap(page); } + down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; @@ -196,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; + up_write(&iinfo->i_data_sem); err = inode->i_data.a_ops->writepage(page, &udf_wbc); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); kaddr = kmap(page); + down_write(&iinfo->i_data_sem); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap(page); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; + up_write(&iinfo->i_data_sem); } page_cache_release(page); mark_inode_dirty(inode); @@ -310,7 +325,6 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int err, new; - struct buffer_head *bh; sector_t phys = 0; struct udf_inode_info *iinfo; @@ -323,7 +337,6 @@ static int udf_get_block(struct inode *inode, sector_t block, err = -EIO; new = 0; - bh = NULL; iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); @@ -332,13 +345,10 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } - err = 0; - bh = inode_getblk(inode, block, &err, &phys, &new); - BUG_ON(bh); - if (err) + phys = inode_getblk(inode, block, &err, &new); + if (!phys) goto abort; - BUG_ON(!phys); if (new) set_buffer_new(bh_result); @@ -547,11 +557,10 @@ out: return err; } -static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, - int *err, sector_t *phys, int *new) +static sector_t inode_getblk(struct inode *inode, sector_t block, + int *err, int *new) { static sector_t last_block; - struct buffer_head *result = NULL; struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; @@ -566,6 +575,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; + *err = 0; + *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; @@ -635,8 +646,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); - *phys = newblock; - return NULL; + return newblock; } last_block = block; @@ -664,7 +674,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, brelse(cur_epos.bh); brelse(next_epos.bh); *err = ret; - return NULL; + return 0; } c = 0; offset = 0; @@ -729,7 +739,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, if (!newblocknum) { brelse(prev_epos.bh); *err = -ENOSPC; - return NULL; + return 0; } iinfo->i_lenExtents += inode->i_sb->s_blocksize; } @@ -761,10 +771,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); - if (!newblock) - return NULL; - *phys = newblock; - *err = 0; + if (!newblock) { + *err = -EIO; + return 0; + } *new = 1; iinfo->i_next_alloc_block = block; iinfo->i_next_alloc_goal = newblocknum; @@ -775,7 +785,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, else mark_inode_dirty(inode); - return result; + return newblock; } static void udf_split_extents(struct inode *inode, int *c, int offset, @@ -1111,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize) if (bsize < (udf_file_entry_alloc_offset(inode) + newsize)) { err = udf_expand_file_adinicb(inode); - if (err) { - up_write(&iinfo->i_data_sem); + if (err) return err; - } + down_write(&iinfo->i_data_sem); } else iinfo->i_lenAlloc = newsize; } @@ -1452,9 +1461,9 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) return 0; } -static mode_t udf_convert_permissions(struct fileEntry *fe) +static umode_t udf_convert_permissions(struct fileEntry *fe) { - mode_t mode; + umode_t mode; uint32_t permissions; uint32_t flags; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 4639e137222..08bf46edf9c 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -552,7 +552,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, int mode, +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd) { struct udf_fileident_bh fibh; @@ -596,7 +596,7 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, return 0; } -static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -640,7 +640,7 @@ out: return err; } -static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_bh fibh; diff --git a/fs/udf/super.c b/fs/udf/super.c index e185253470d..c09a84daaf5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -89,7 +89,7 @@ static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); -static int udf_show_options(struct seq_file *, struct vfsmount *); +static int udf_show_options(struct seq_file *, struct dentry *); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -138,7 +138,6 @@ static struct inode *udf_alloc_inode(struct super_block *sb) static void udf_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } @@ -196,11 +195,11 @@ struct udf_options { unsigned int fileset; unsigned int rootdir; unsigned int flags; - mode_t umask; + umode_t umask; gid_t gid; uid_t uid; - mode_t fmode; - mode_t dmode; + umode_t fmode; + umode_t dmode; struct nls_table *nls_map; }; @@ -250,9 +249,9 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) return 0; } -static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) +static int udf_show_options(struct seq_file *seq, struct dentry *root) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = root->d_sb; struct udf_sb_info *sbi = UDF_SB(sb); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) @@ -280,11 +279,11 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) seq_printf(seq, ",gid=%u", sbi->s_gid); if (sbi->s_umask != 0) - seq_printf(seq, ",umask=%o", sbi->s_umask); + seq_printf(seq, ",umask=%ho", sbi->s_umask); if (sbi->s_fmode != UDF_INVALID_MODE) - seq_printf(seq, ",mode=%o", sbi->s_fmode); + seq_printf(seq, ",mode=%ho", sbi->s_fmode); if (sbi->s_dmode != UDF_INVALID_MODE) - seq_printf(seq, ",dmode=%o", sbi->s_dmode); + seq_printf(seq, ",dmode=%ho", sbi->s_dmode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) seq_printf(seq, ",session=%u", sbi->s_session); if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) @@ -1799,6 +1798,12 @@ static void udf_close_lvid(struct super_block *sb) le16_to_cpu(lvid->descTag.descCRCLength))); lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + /* + * We set buffer uptodate unconditionally here to avoid spurious + * warnings from mark_buffer_dirty() when previous EIO has marked + * the buffer as !uptodate + */ + set_buffer_uptodate(bh); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index b1d4488b0f1..d7c6dbe4194 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from, pc = (struct pathComponent *)(from + elen); switch (pc->componentType) { case 1: - if (pc->lengthComponentIdent == 0) { - p = to; - *p++ = '/'; - } + /* + * Symlink points to some place which should be agreed + * upon between originator and receiver of the media. Ignore. + */ + if (pc->lengthComponentIdent > 0) + break; + /* Fall through */ + case 2: + p = to; + *p++ = '/'; break; case 3: memcpy(p, "../", 3); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 5142a82e327..42ad69ac957 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -50,7 +50,7 @@ #define UDF_SPARABLE_MAP15 0x1522U #define UDF_METADATA_MAP25 0x2511U -#define UDF_INVALID_MODE ((mode_t)-1) +#define UDF_INVALID_MODE ((umode_t)-1) #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ @@ -127,11 +127,11 @@ struct udf_sb_info { struct buffer_head *s_lvid_bh; /* Default permissions */ - mode_t s_umask; + umode_t s_umask; gid_t s_gid; uid_t s_uid; - mode_t s_fmode; - mode_t s_dmode; + umode_t s_fmode; + umode_t s_dmode; /* Lock protecting consistency of above permission settings */ rwlock_t s_cred_lock; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f34e6fc0cda..ebe10314e51 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -215,7 +215,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); -extern struct inode *udf_new_inode(struct inode *, int, int *); +extern struct inode *udf_new_inode(struct inode *, umode_t, int *); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 78a4c70d46b..4ec5c1085a8 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -170,7 +170,7 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode * ufs_new_inode(struct inode * dir, int mode) +struct inode *ufs_new_inode(struct inode *dir, umode_t mode) { struct super_block * sb; struct ufs_sb_info * sbi; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 879b13436fa..9094e1d917b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -583,7 +583,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; - mode_t mode; + umode_t mode; /* * Copy data to the in-core inode. @@ -630,7 +630,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; - mode_t mode; + umode_t mode; UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); /* diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 639d4916224..38cac199edf 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -70,7 +70,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, +static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd) { struct inode *inode; @@ -94,7 +94,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, return err; } -static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) +static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; int err; @@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err = -EMLINK; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3915ade6f9a..5246ee3e560 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1351,9 +1351,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return 0; } -static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int ufs_show_options(struct seq_file *seq, struct dentry *root) { - struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); + struct ufs_sb_info *sbi = UFS_SB(root->d_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; const struct match_token *tp = tokens; @@ -1425,7 +1425,6 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) static void ufs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index c26f2bcec26..528750b7e70 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -104,7 +104,7 @@ extern const struct address_space_operations ufs_aops; /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); -extern struct inode * ufs_new_inode (struct inode *, int); +extern struct inode * ufs_new_inode (struct inode *, umode_t); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); diff --git a/fs/xattr.c b/fs/xattr.c index 67583de8218..82f43376c7c 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -397,7 +397,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, error = mnt_want_write_file(f); if (!error) { error = setxattr(dentry, name, value, size, flags); - mnt_drop_write(f->f_path.mnt); + mnt_drop_write_file(f); } fput(f); return error; @@ -624,7 +624,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) error = mnt_want_write_file(f); if (!error) { error = removexattr(dentry, name); - mnt_drop_write(f->f_path.mnt); + mnt_drop_write_file(f); } fput(f); return error; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 292eff19803..ab7c53fe346 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone) extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); -static inline int -kmem_shake_allow(gfp_t gfp_mask) -{ - return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); -} - #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 76e4266d2e7..ac702a6eab9 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -39,7 +39,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) struct posix_acl_entry *acl_e; struct posix_acl *acl; struct xfs_acl_entry *ace; - int count, i; + unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); if (count > XFS_ACL_MAX_ENTRIES) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 574d4ee9b62..74b9baf36ac 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,8 +111,7 @@ xfs_ioend_new_eof( xfs_fsize_t bsize; bsize = ioend->io_offset + ioend->io_size; - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); + isize = MIN(i_size_read(VFS_I(ip)), bsize); return isize > ip->i_d.di_size ? isize : 0; } @@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) } /* - * Update on-disk file size now that data has been written to disk. The - * current in-memory file size is i_size. If a write is beyond eof i_new_size - * will be the intended file size until i_size is updated. If this write does - * not extend all the way to the valid file size then restrict this update to - * the end of the write. + * Update on-disk file size now that data has been written to disk. * * This function does not block as blocking on the inode lock in IO completion * can lead to IO completion order dependency deadlocks.. If it can't get the @@ -1279,6 +1274,15 @@ xfs_end_io_direct_write( struct xfs_ioend *ioend = iocb->private; /* + * While the generic direct I/O code updates the inode size, it does + * so only after the end_io handler is called, which means our + * end_io handler thinks the on-disk size is outside the in-core + * size. To prevent this just update it a little bit earlier here. + */ + if (offset + size > i_size_read(ioend->io_inode)) + i_size_write(ioend->io_inode, offset + size); + + /* * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect * against double-freeing. @@ -1340,12 +1344,11 @@ xfs_vm_write_failed( if (to > inode->i_size) { /* - * punch out the delalloc blocks we have already allocated. We - * don't call xfs_setattr() to do this as we may be in the - * middle of a multi-iovec write and so the vfs inode->i_size - * will not match the xfs ip->i_size and so it will zero too - * much. Hence we jus truncate the page cache to zero what is - * necessary and punch the delalloc blocks directly. + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have + * made it to disk yet as the page is still locked at this + * point. */ struct xfs_inode *ip = XFS_I(inode); xfs_fileoff_t start_fsb; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1e5d97f86ea..08b9ac644c3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp) if (error) goto out; - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index c1b55e59655..d25eafd4d28 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -326,7 +322,6 @@ xfs_attr_fork_reset( ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_afp == NULL); - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || dp->i_d.di_format == XFS_DINODE_FMT_BTREE); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d0ab7883705..188ef2fbd62 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge( } /* -* Update the record referred to by cur to the value given + * Check if the inode needs to be converted to btree format. + */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Check if the inode should be converted to extent format. + */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} + +/* + * Update the record referred to by cur to the value given * by [off, bno, len, state]. * This either works (return 0) or gets an EFSCORRUPTED error. */ @@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); @@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); @@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real( } /* convert to a btree if necessary */ - if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); @@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Make space in the inode incore. */ @@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset( ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; - if (dfl_forkoff > ip->i_d.di_forkoff) { + if (dfl_forkoff > ip->i_d.di_forkoff) ip->i_d.di_forkoff = dfl_forkoff; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t); - } } } @@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork( int error; /* error return value */ ASSERT(XFS_IFORK_Q(ip) == 0); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork( error = XFS_ERROR(EINVAL); goto error1; } - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; xfs_bmap_init(&flist, &firstblock); @@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork( } else spin_unlock(&mp->m_sb_lock); } - if ((error = xfs_bmap_finish(&tp, &flist, &committed))) + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); error2: xfs_bmap_cancel(&flist); error1: xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; } @@ -3994,11 +3997,8 @@ xfs_bmap_one_block( xfs_bmbt_irec_t s; /* internal version of extent */ #ifndef DEBUG - if (whichfork == XFS_DATA_FORK) { - return S_ISREG(ip->i_d.di_mode) ? - (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : - (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); - } + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) return 0; @@ -4010,7 +4010,7 @@ xfs_bmap_one_block( xfs_bmbt_get_all(ep, &s); rval = s.br_startoff == 0 && s.br_blockcount == 1; if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); return rval; } @@ -4379,8 +4379,6 @@ xfs_bmapi_read( XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); @@ -4871,8 +4869,6 @@ xfs_bmapi_write( return XFS_ERROR(EIO); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); XFS_STATS_INC(xs_blk_mapw); @@ -4981,8 +4977,7 @@ xfs_bmapi_write( /* * Transform from btree to extents, give it cur. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; ASSERT(bma.cur); @@ -4992,10 +4987,10 @@ xfs_bmapi_write( if (error) goto error0; } - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); error = 0; error0: /* @@ -5095,8 +5090,7 @@ xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5322,7 +5316,8 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { @@ -5353,13 +5348,11 @@ nodelete: } } *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Convert to a btree if necessary. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5370,8 +5363,7 @@ nodelete: /* * transform from btree to extents, give it cur */ - else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + else if (xfs_bmap_wants_extents(ip, whichfork)) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5382,8 +5374,6 @@ nodelete: /* * transform from extents to local? */ - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; error0: /* @@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; out->bmv_block = -1; - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); fixlen -= out->bmv_offset; if (prealloced && out->bmv_offset + out->bmv_length == end) { /* Came to hole at EOF. Trim it. */ @@ -5522,7 +5512,7 @@ xfs_getbmap( fixlen = XFS_MAXIOFFSET(mp); } else { prealloced = 0; - fixlen = ip->i_size; + fixlen = XFS_ISIZE(ip); } } @@ -5551,7 +5541,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { + if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); if (error) goto out_unlock_iolock; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cf0ac056815..4dff85c7d7e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1370,7 +1370,7 @@ restart: goto restart; } /* - * clear the LRU reference count so the bufer doesn't get + * clear the LRU reference count so the buffer doesn't get * ignored in xfs_buf_rele(). */ atomic_set(&bp->b_lru_ref, 0); @@ -1701,12 +1701,8 @@ xfsbufd( struct list_head tmp; struct blk_plug plug; - if (unlikely(freezing(current))) { - set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); - } else { - clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); - } + if (unlikely(freezing(current))) + try_to_freeze(); /* sleep for a long time if there is nothing to do. */ if (list_empty(&target->bt_delwri_queue)) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bab046e859..df7ffb0affe 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -90,8 +90,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" } typedef enum { - XBT_FORCE_SLEEP = 0, - XBT_FORCE_FLUSH = 1, + XBT_FORCE_FLUSH = 0, } xfs_buftarg_flags_t; typedef struct xfs_buftarg { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 654dc6f05ba..dd974a55c77 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -163,12 +163,14 @@ xfs_swap_extents_check_format( /* Check temp in extent form to max in target */ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return EINVAL; /* Check target in extent form to max in temp */ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; /* @@ -180,18 +182,25 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) - return EINVAL; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - ((XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) - return EINVAL; + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } return 0; } @@ -349,16 +358,6 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* - * Fix the in-memory data fork values that are dependent on the fork - * offset in the inode. We can't assume they remain the same as attr2 - * has dynamic fork offsets. - */ - ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / - (uint)sizeof(xfs_bmbt_rec_t); - - /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8a24f0c6c86..286a051f12c 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -68,7 +68,7 @@ xfs_trim_extents( * Look up the longest btree in the AGF and start with it. */ error = xfs_alloc_lookup_le(cur, 0, - XFS_BUF_TO_AGF(agbp)->agf_longest, &i); + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); if (error) goto out_del_cursor; @@ -84,7 +84,7 @@ xfs_trim_extents( if (error) goto out_del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); - ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* * Too small? Give up. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 25d7280e9f6..53db20ee3e7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -39,20 +39,19 @@ #include "xfs_qm.h" #include "xfs_trace.h" - /* - LOCK ORDER - - inode lock (ilock) - dquot hash-chain lock (hashlock) - xqm dquot freelist lock (freelistlock - mount's dquot list lock (mplistlock) - user dquot lock - lock ordering among dquots is based on the uid or gid - group dquot lock - similar to udquots. Between the two dquots, the udquot - has to be locked first. - pin lock - the dquot lock must be held to take this lock. - flush lock - ditto. -*/ + * Lock order: + * + * ip->i_lock + * qh->qh_lock + * qi->qi_dqlist_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * xfs_Gqm->qm_dqfrlist_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; @@ -64,82 +63,6 @@ int xfs_dqerror_mod = 33; static struct lock_class_key xfs_dquot_other_class; /* - * Allocate and initialize a dquot. We don't always allocate fresh memory; - * we try to reclaim a free dquot if the number of incore dquots are above - * a threshold. - * The only field inside the core that gets initialized at this point - * is the d_id field. The idea is to fill in the entire q_core - * when we read in the on disk dquot. - */ -STATIC xfs_dquot_t * -xfs_qm_dqinit( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type) -{ - xfs_dquot_t *dqp; - boolean_t brandnewdquot; - - brandnewdquot = xfs_qm_dqalloc_incore(&dqp); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); - dqp->q_mount = mp; - - /* - * No need to re-initialize these if this is a reclaimed dquot. - */ - if (brandnewdquot) { - INIT_LIST_HEAD(&dqp->q_freelist); - mutex_init(&dqp->q_qlock); - init_waitqueue_head(&dqp->q_pinwait); - - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&dqp->q_flush); - complete(&dqp->q_flush); - - trace_xfs_dqinit(dqp); - } else { - /* - * Only the q_core portion was zeroed in dqreclaim_one(). - * So, we need to reset others. - */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - INIT_LIST_HEAD(&dqp->q_mplist); - INIT_LIST_HEAD(&dqp->q_hashlist); - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(list_empty(&dqp->q_freelist)); - - trace_xfs_dqreuse(dqp); - } - - /* - * In either case we need to make sure group quotas have a different - * lock class than user quotas, to make sure lockdep knows we can - * locks of one of each at the same time. - */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); - - /* - * log item gets initialized later - */ - return (dqp); -} - -/* * This is called to free all the memory associated with a dquot */ void @@ -155,24 +78,6 @@ xfs_qm_dqdestroy( } /* - * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. - */ -STATIC void -xfs_qm_dqinit_core( - xfs_dqid_t id, - uint type, - xfs_dqblk_t *d) -{ - /* - * Caller has zero'd the entire dquot 'chunk' already. - */ - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_id = cpu_to_be32(id); - d->dd_diskdq.d_flags = type; -} - -/* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this * is not the root dquot. @@ -234,10 +139,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_btimer) { if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_softlimit))) || (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_btimelimit); @@ -246,10 +151,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_softlimit))) && (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = 0; } @@ -257,10 +162,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_itimer) { if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_softlimit))) || (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_itimelimit); @@ -269,10 +174,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_softlimit))) && (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = 0; } @@ -280,10 +185,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_rtbtimer) { if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_softlimit))) || (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_rtbtimelimit); @@ -292,10 +197,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_softlimit))) && (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = 0; } @@ -328,8 +233,13 @@ xfs_qm_init_dquot_blk( curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); memset(d, 0, BBTOB(q->qi_dqchunklen)); - for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) - xfs_qm_dqinit_core(curid, type, d); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + } + xfs_trans_dquot_buf(tp, bp, (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : @@ -564,36 +474,87 @@ xfs_qm_dqtobp( * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. */ -/* ARGSUSED */ -STATIC int +int xfs_qm_dqread( - xfs_trans_t **tpp, - xfs_dqid_t id, - xfs_dquot_t *dqp, /* dquot to get filled in */ - uint flags) + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) { - xfs_disk_dquot_t *ddqp; - xfs_buf_t *bp; - int error; - xfs_trans_t *tp; + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; + + + dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_freelist); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); - ASSERT(tpp); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + atomic_inc(&xfs_Gqm->qm_totaldquots); trace_xfs_dqread(dqp); + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + /* + * Round the chunklen up to the next multiple + * of 128 (buf log item chunk size)). + */ + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + /* * get a pointer to the on-disk dquot and the buffer containing it * dqp already knows its own type (GROUP/USER). */ - if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { - return (error); + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; } - tp = *tpp; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); xfs_qm_dquot_logitem_init(dqp); /* @@ -622,77 +583,22 @@ xfs_qm_dqread( ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); - return (error); -} - - -/* - * allocate an incore dquot from the kernel heap, - * and fill its core with quota information kept on disk. - * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk - * if it wasn't already allocated. - */ -STATIC int -xfs_qm_idtodq( - xfs_mount_t *mp, - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ - uint flags, /* DQALLOC, DQREPAIR */ - xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ -{ - xfs_dquot_t *dqp; - int error; - xfs_trans_t *tp; - int cancelflags=0; - - dqp = xfs_qm_dqinit(mp, id, type); - tp = NULL; - if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error) { - cancelflags = 0; - goto error0; - } - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - } - - /* - * Read it from disk; xfs_dqread() takes care of - * all the necessary initialization of dquot's fields (locks, etc) - */ - if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; - goto error0; - } if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) - goto error1; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; } *O_dqpp = dqp; - return (0); + return error; - error0: - ASSERT(error); +error1: if (tp) xfs_trans_cancel(tp, cancelflags); - error1: +error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; - return (error); + return error; } /* @@ -710,12 +616,9 @@ xfs_qm_dqlookup( xfs_dquot_t **O_dqpp) { xfs_dquot_t *dqp; - uint flist_locked; ASSERT(mutex_is_locked(&qh->qh_lock)); - flist_locked = B_FALSE; - /* * Traverse the hashchain looking for a match */ @@ -725,70 +628,31 @@ xfs_qm_dqlookup( * dqlock to look at the id field of the dquot, since the * id can't be modified without the hashlock anyway. */ - if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - trace_xfs_dqlookup_found(dqp); + if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp) + continue; - /* - * All in core dquots must be on the dqlist of mp - */ - ASSERT(!list_empty(&dqp->q_mplist)); - - xfs_dqlock(dqp); - if (dqp->q_nrefs == 0) { - ASSERT(!list_empty(&dqp->q_freelist)); - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqlookup_want(dqp); - - /* - * We may have raced with dqreclaim_one() - * (and lost). So, flag that we don't - * want the dquot to be reclaimed. - */ - dqp->dq_flags |= XFS_DQ_WANT; - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); - dqp->dq_flags &= ~(XFS_DQ_WANT); - } - flist_locked = B_TRUE; - } + trace_xfs_dqlookup_found(dqp); - /* - * id couldn't have changed; we had the hashlock all - * along - */ - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - - if (flist_locked) { - if (dqp->q_nrefs != 0) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - flist_locked = B_FALSE; - } else { - /* take it off the freelist */ - trace_xfs_dqlookup_freelist(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - } - } + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + *O_dqpp = NULL; + xfs_dqunlock(dqp); + return -1; + } - XFS_DQHOLD(dqp); + dqp->q_nrefs++; - if (flist_locked) - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * move the dquot to the front of the hashchain - */ - ASSERT(mutex_is_locked(&qh->qh_lock)); - list_move(&dqp->q_hashlist, &qh->qh_list); - trace_xfs_dqlookup_done(dqp); - *O_dqpp = dqp; - return 0; - } + /* + * move the dquot to the front of the hashchain + */ + list_move(&dqp->q_hashlist, &qh->qh_list); + trace_xfs_dqlookup_done(dqp); + *O_dqpp = dqp; + return 0; } *O_dqpp = NULL; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (1); + return 1; } /* @@ -829,11 +693,7 @@ xfs_qm_dqget( return (EIO); } } -#endif - again: - -#ifdef DEBUG ASSERT(type == XFS_DQ_USER || type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); @@ -845,13 +705,21 @@ xfs_qm_dqget( ASSERT(ip->i_gdquot == NULL); } #endif + +restart: mutex_lock(&h->qh_lock); /* * Look in the cache (hashtable). * The chain is kept locked during lookup. */ - if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) { + case -1: + XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); + mutex_unlock(&h->qh_lock); + delay(1); + goto restart; + case 0: XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); /* * The dquot was found, moved to the front of the chain, @@ -862,9 +730,11 @@ xfs_qm_dqget( ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); mutex_unlock(&h->qh_lock); trace_xfs_dqget_hit(*O_dqpp); - return (0); /* success */ + return 0; /* success */ + default: + XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); + break; } - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -882,41 +752,18 @@ xfs_qm_dqget( version = h->qh_version; mutex_unlock(&h->qh_lock); - /* - * Allocate the dquot on the kernel heap, and read the ondisk - * portion off the disk. Also, do all the necessary initialization - * This can return ENOENT if dquot didn't exist on disk and we didn't - * ask it to allocate; ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_idtodq(mp, id, type, - flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| - XFS_QMOPT_DOWARN), - &dqp))) { - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - return (error); - } + error = xfs_qm_dqread(mp, id, type, flags, &dqp); - /* - * See if this is mount code calling to look at the overall quota limits - * which are stored in the id == 0 user or group's dquot. - * Since we may not have done a quotacheck by this point, just return - * the dquot without attaching it to any hashtables, lists, etc, or even - * taking a reference. - * The caller must dqdestroy this once done. - */ - if (flags & XFS_QMOPT_DQSUSER) { - ASSERT(id == 0); - ASSERT(! ip); - goto dqret; - } + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; /* * Dquot lock comes after hashlock in the lock ordering */ if (ip) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. @@ -961,16 +808,21 @@ xfs_qm_dqget( * lock order between the two dquots here since dqp isn't * on any findable lists yet. */ - if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { + switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) { + case 0: + case -1: /* - * Duplicate found. Just throw away the new dquot - * and start over. + * Duplicate found, either in cache or on its way out. + * Just throw away the new dquot and start over. */ - xfs_qm_dqput(tmpdqp); + if (tmpdqp) + xfs_qm_dqput(tmpdqp); mutex_unlock(&h->qh_lock); xfs_qm_dqdestroy(dqp); XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto again; + goto restart; + default: + break; } } @@ -1015,67 +867,49 @@ xfs_qm_dqget( */ void xfs_qm_dqput( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dquot_t *gdqp; + struct xfs_dquot *gdqp; ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); - if (dqp->q_nrefs != 1) { - dqp->q_nrefs--; +recurse: + if (--dqp->q_nrefs > 0) { xfs_dqunlock(dqp); return; } + trace_xfs_dqput_free(dqp); + + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + if (list_empty(&dqp->q_freelist)) { + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + /* - * drop the dqlock and acquire the freelist and dqlock - * in the right order; but try to get it out-of-order first + * If we just added a udquot to the freelist, then we want to release + * the gdquot reference that it (probably) has. Otherwise it'll keep + * the gdquot from getting reclaimed. */ - if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { - trace_xfs_dqput_wait(dqp); - xfs_dqunlock(dqp); - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - xfs_dqlock(dqp); + gdqp = dqp->q_gdquot; + if (gdqp) { + xfs_dqlock(gdqp); + dqp->q_gdquot = NULL; } + xfs_dqunlock(dqp); - while (1) { - gdqp = NULL; - - /* We can't depend on nrefs being == 1 here */ - if (--dqp->q_nrefs == 0) { - trace_xfs_dqput_free(dqp); - - list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - xfs_Gqm->qm_dqfrlist_cnt++; - - /* - * If we just added a udquot to the freelist, then - * we want to release the gdquot reference that - * it (probably) has. Otherwise it'll keep the - * gdquot from getting reclaimed. - */ - if ((gdqp = dqp->q_gdquot)) { - /* - * Avoid a recursive dqput call - */ - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - } - xfs_dqunlock(dqp); - - /* - * If we had a group quota inside the user quota as a hint, - * release it now. - */ - if (! gdqp) - break; + /* + * If we had a group quota hint, release it now. + */ + if (gdqp) { dqp = gdqp; + goto recurse; } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1169,7 +1003,7 @@ xfs_qm_dqflush( * If not dirty, or it's pinned and we are not supposed to block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || - (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { + ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); return 0; } @@ -1257,40 +1091,17 @@ xfs_qm_dqflush( } -int -xfs_qm_dqlock_nowait( - xfs_dquot_t *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -void -xfs_dqlock( - xfs_dquot_t *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - void xfs_dqunlock( xfs_dquot_t *dqp) { - mutex_unlock(&(dqp->q_qlock)); + xfs_dqunlock_nonotify(dqp); if (dqp->q_logitem.qli_dquot == dqp) { - /* Once was dqp->q_mount, but might just have been cleared */ xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, - (xfs_log_item_t*)&(dqp->q_logitem)); + &dqp->q_logitem.qli_item); } } - -void -xfs_dqunlock_nonotify( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); -} - /* * Lock two xfs_dquot structures. * @@ -1319,43 +1130,18 @@ xfs_dqlock2( } } - /* - * Take a dquot out of the mount's dqlist as well as the hashlist. - * This is called via unmount as well as quotaoff, and the purge - * will always succeed unless there are soft (temp) references - * outstanding. - * - * This returns 0 if it was purged, 1 if it wasn't. It's not an error code - * that we're returning! XXXsup - not cool. + * Take a dquot out of the mount's dqlist as well as the hashlist. This is + * called via unmount as well as quotaoff, and the purge will always succeed. */ -/* ARGSUSED */ -int +void xfs_qm_dqpurge( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { - xfs_dqhash_t *qh = dqp->q_hash; - xfs_mount_t *mp = dqp->q_mount; - - ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); - ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); + struct xfs_mount *mp = dqp->q_mount; + struct xfs_dqhash *qh = dqp->q_hash; xfs_dqlock(dqp); - /* - * We really can't afford to purge a dquot that is - * referenced, because these are hard refs. - * It shouldn't happen in general because we went thru _all_ inodes in - * dqrele_all_inodes before calling this and didn't let the mountlock go. - * However it is possible that we have dquots with temporary - * references that are not attached to an inode. e.g. see xfs_setattr(). - */ - if (dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - mutex_unlock(&dqp->q_hash->qh_lock); - return (1); - } - - ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1370,23 +1156,18 @@ xfs_qm_dqpurge( * Block on the flush lock after nudging dquot buffer, * if it is incore. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* - * XXXIf we're turning this type of quotas off, we don't care + * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { int error; - /* dqflush unlocks dqflock */ /* - * Given that dqpurge is a very rare occurrence, it is OK - * that we're holding the hashlist and mplist locks - * across the disk write. But, ... XXXsup - * * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ @@ -1396,38 +1177,44 @@ xfs_qm_dqpurge( __func__, dqp); xfs_dqflock(dqp); } + ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + mutex_lock(&qh->qh_lock); list_del_init(&dqp->q_hashlist); qh->qh_version++; + mutex_unlock(&qh->qh_lock); + + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); list_del_init(&dqp->q_mplist); mp->m_quotainfo->qi_dqreclaims++; mp->m_quotainfo->qi_dquots--; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + /* - * XXX Move this to the front of the freelist, if we can get the - * freelist lock. + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. */ + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); ASSERT(!list_empty(&dqp->q_freelist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - dqp->q_mount = NULL; - dqp->q_hash = NULL; - dqp->dq_flags = XFS_DQ_INACTIVE; - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - mutex_unlock(&qh->qh_lock); - return (0); + xfs_qm_dqdestroy(dqp); } - /* * Give the buffer a little push if it is incore and * wait on the flush lock. */ void -xfs_qm_dqflock_pushbuf_wait( +xfs_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { xfs_mount_t *mp = dqp->q_mount; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e945dbf..a1d91d8f180 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -80,8 +80,6 @@ enum { XFS_QLOCK_NESTED, }; -#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -102,6 +100,21 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) @@ -116,12 +129,12 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ (XFS_IS_OQUOTA_ON((d)->q_mount)))) +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *); +extern void xfs_qm_dqpurge(xfs_dquot_t *); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, @@ -129,9 +142,17 @@ extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); -extern void xfs_dqlock(xfs_dquot_t *); -extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); -extern void xfs_dqunlock(xfs_dquot_t *); -extern void xfs_dqunlock_nonotify(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); +extern void xfs_dqunlock(struct xfs_dquot *); +extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 0dee0b71029..34baeae4526 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -73,7 +73,6 @@ xfs_qm_dquot_logitem_format( logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == lip->li_desc->lid_size); qlip->qli_format.qlf_size = 2; } @@ -134,7 +133,7 @@ xfs_qm_dquot_logitem_push( * lock without sleeping, then there must not have been * anyone in the process of flushing the dquot. */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); if (error) xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", __func__, error, dqp); @@ -237,7 +236,7 @@ xfs_qm_dquot_logitem_trylock( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_qm_dqlock_nowait(dqp)) + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; if (!xfs_dqflock_nowait(dqp)) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 753ed9b5c70..7e5bc872f2b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -209,10 +209,10 @@ xfs_file_fsync( /* * First check if the VFS inode is marked dirty. All the dirtying - * of non-transactional updates no goes through mark_inode_dirty*, - * which allows us to distinguish beteeen pure timestamp updates + * of non-transactional updates do not go through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates * and i_size updates which need to be caught for fdatasync. - * After that also theck for the dirty state in the XFS inode, which + * After that also check for the dirty state in the XFS inode, which * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ @@ -327,7 +327,7 @@ xfs_file_aio_read( mp->m_rtdev_targp : mp->m_ddev_targp; if ((iocb->ki_pos & target->bt_smask) || (size & target->bt_smask)) { - if (iocb->ki_pos == ip->i_size) + if (iocb->ki_pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } @@ -412,51 +412,6 @@ xfs_file_splice_read( return ret; } -STATIC void -xfs_aio_write_isize_update( - struct inode *inode, - loff_t *ppos, - ssize_t bytes_written) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t isize = i_size_read(inode); - - if (bytes_written > 0) - XFS_STATS_ADD(xs_write_bytes, bytes_written); - - if (unlikely(bytes_written < 0 && bytes_written != -EFAULT && - *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - -/* - * If this was a direct or synchronous I/O that failed (such as ENOSPC) then - * part of the I/O may have been written to disk before the error occurred. In - * this case the on-disk file size may have been adjusted beyond the in-memory - * file size and now needs to be truncated back. - */ -STATIC void -xfs_aio_write_newsize_update( - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - if (new_size == ip->i_new_size) { - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - if (new_size == ip->i_new_size) - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - } -} - /* * xfs_file_splice_write() does not use xfs_rw_ilock() because * generic_file_splice_write() takes the i_mutex itself. This, in theory, @@ -475,7 +430,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int ioflags = 0; ssize_t ret; @@ -489,19 +443,12 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_aio_write_isize_update(inode, ppos, ret); - xfs_aio_write_newsize_update(ip, new_size); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -689,28 +636,26 @@ out_lock: /* * Common pre-write limit and setup checks. * - * Returns with iolock held according to @iolock. + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, - xfs_fsize_t *new_sizep, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); - *new_sizep = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); - *iolock = 0; + xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -720,36 +665,21 @@ restart: /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. There is no need to issue zeroing if another in-flght IO ends - * at or before this one If zeronig is needed and we are currently - * holding the iolock shared, we need to update it to exclusive which - * involves dropping all locks and relocking to maintain correct locking - * order. If we do this, restart the function to ensure all checks and - * values are still valid. + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which involves + * dropping all locks and relocking to maintain correct locking order. + * If we do this, restart the function to ensure all checks and values + * are still valid. */ - if ((ip->i_new_size && *pos > ip->i_new_size) || - (!ip->i_new_size && *pos > ip->i_size)) { + if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } - error = -xfs_zero_eof(ip, *pos, ip->i_size); + error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } - - /* - * If this IO extends beyond EOF, we may need to update ip->i_new_size. - * We have already zeroed space beyond EOF (if necessary). Only update - * ip->i_new_size if this IO ends beyond any other in-flight writes. - */ - new_size = *pos + *count; - if (new_size > ip->i_size) { - if (new_size > ip->i_new_size) - ip->i_new_size = new_size; - *new_sizep = new_size; - } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -794,9 +724,7 @@ xfs_file_dio_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -806,10 +734,10 @@ xfs_file_dio_aio_write( ssize_t ret = 0; size_t count = ocount; int unaligned_io = 0; + int iolock; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - *iolock = 0; if ((pos & target->bt_smask) || (count & target->bt_smask)) return -XFS_ERROR(EINVAL); @@ -824,31 +752,31 @@ xfs_file_dio_aio_write( * EOF zeroing cases and fill out the new inode size as appropriate. */ if (unaligned_io || mapping->nrpages) - *iolock = XFS_IOLOCK_EXCL; + iolock = XFS_IOLOCK_EXCL; else - *iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, *iolock); + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); /* * Recheck if there are cached pages that need invalidate after we got * the iolock to protect against other threads adding new pages while * we were waiting for the iolock. */ - if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; if (mapping->nrpages) { ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (ret) - return ret; + goto out; } /* @@ -857,15 +785,18 @@ xfs_file_dio_aio_write( */ if (unaligned_io) inode_dio_wait(inode); - else if (*iolock == XFS_IOLOCK_EXCL) { + else if (iolock == XFS_IOLOCK_EXCL) { xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - *iolock = XFS_IOLOCK_SHARED; + iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); +out: + xfs_rw_iunlock(ip, iolock); + /* No fallback to buffered IO on errors for XFS. */ ASSERT(ret < 0 || ret == count); return ret; @@ -877,9 +808,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount, - xfs_fsize_t *new_size, - int *iolock) + size_t ocount) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -887,14 +816,14 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; size_t count = ocount; - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) - return ret; + goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -908,13 +837,15 @@ write_retry: * page locks and retry *once* */ if (ret == -ENOSPC && !enospc) { - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (ret) - return ret; enospc = 1; - goto write_retry; + ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (!ret) + goto write_retry; } + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -930,9 +861,7 @@ xfs_file_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int iolock; size_t ocount = 0; - xfs_fsize_t new_size = 0; XFS_STATS_INC(xs_write_calls); @@ -951,33 +880,22 @@ xfs_file_aio_write( return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); + ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); else ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount, &new_size, &iolock); - - xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); + ocount); - if (ret <= 0) - goto out_unlock; + if (ret > 0) { + ssize_t err; - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error; + XFS_STATS_ADD(xs_write_bytes, ret); - xfs_rw_iunlock(ip, iolock); - error = xfs_file_fsync(file, pos, end, - (file->f_flags & __O_SYNC) ? 0 : 1); - xfs_rw_ilock(ip, iolock); - if (error) - ret = error; + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, pos, ret); + if (err < 0) + ret = err; } -out_unlock: - xfs_aio_write_newsize_update(ip, new_size); - xfs_rw_iunlock(ip, iolock); return ret; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811..652b875a9d4 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -90,7 +90,7 @@ xfs_wait_on_pages( if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { return -filemap_fdatawait_range(mapping, first, - last == -1 ? ip->i_size - 1 : last); + last == -1 ? XFS_ISIZE(ip) - 1 : last); } return 0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 169380e6605..dad1a31aa4f 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -447,7 +447,7 @@ STATIC xfs_buf_t * /* allocation group buffer */ xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - mode_t mode, /* bits set to indicate file type */ + umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { xfs_buf_t *agbp; /* allocation group header buffer */ @@ -640,7 +640,7 @@ int xfs_dialloc( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ boolean_t *alloc_done, /* true if we needed to replenish diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index bb5385475e1..666a037398d 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -81,7 +81,7 @@ int /* error */ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ boolean_t *alloc_done, /* an allocation was done to replenish diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0fa98b1c70e..8c3e46394d4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -77,7 +77,7 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -94,8 +94,6 @@ xfs_inode_alloc( ip->i_update_core = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - ip->i_size = 0; - ip->i_new_size = 0; return ip; } @@ -107,7 +105,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - INIT_LIST_HEAD(&inode->i_dentry); kmem_zone_free(xfs_inode_zone, ip); } @@ -151,7 +148,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); + ASSERT(!xfs_isiflocked(ip)); /* * Because we use RCU freeing we need to ensure the inode always @@ -451,8 +448,6 @@ again: *ipp = ip; - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* * If we have a real type for an on-disk inode, we can set ops(&unlock) * now. If it's a new inode being created, xfs_ialloc will handle it. @@ -716,3 +711,19 @@ xfs_isilocked( return 0; } #endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 755ee816488..b21022499c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -299,11 +299,8 @@ xfs_iformat( { xfs_attr_shortform_t *atp; int size; - int error; + int error = 0; xfs_fsize_t di_size; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - error = 0; if (unlikely(be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > @@ -350,7 +347,6 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; @@ -409,10 +405,10 @@ xfs_iformat( } if (!XFS_DFORK_Q(dip)) return 0; + ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); + switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); @@ -604,10 +600,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max - || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) - || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, @@ -835,12 +832,6 @@ xfs_iread( * with the uninitialized part of it. */ ip->i_d.di_mode = 0; - /* - * Initialize the per-fork minima and maxima for a new - * inode here. xfs_iformat will do it for old inodes. - */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } /* @@ -861,7 +852,6 @@ xfs_iread( } ip->i_delayed_blks = 0; - ip->i_size = ip->i_d.di_size; /* * Mark the buffer containing the inode as something to keep @@ -961,7 +951,7 @@ int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, @@ -1002,7 +992,7 @@ xfs_ialloc( return error; ASSERT(ip != NULL); - ip->i_d.di_mode = (__uint16_t)mode; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); @@ -1051,7 +1041,6 @@ xfs_ialloc( } ip->i_d.di_size = 0; - ip->i_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); @@ -1166,52 +1155,6 @@ xfs_ialloc( } /* - * Check to make sure that there are no blocks allocated to the - * file beyond the size of the file. We don't check this for - * files with fixed size extents or real time extents, but we - * at least do it for regular files. - */ -#ifdef DEBUG -STATIC void -xfs_isize_check( - struct xfs_inode *ip, - xfs_fsize_t isize) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; - int error; - - if (!S_ISREG(ip->i_d.di_mode)) - return; - - if (XFS_IS_REALTIME_INODE(ip)) - return; - - if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) - return; - - nimaps = 2; - map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - /* - * The filesystem could be shutting down, so bmapi may return - * an error. - */ - error = xfs_bmapi_read(ip, map_first, - (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), - imaps, &nimaps, XFS_BMAPI_ENTIRE); - if (error) - return; - ASSERT(nimaps == 1); - ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); -} -#else /* DEBUG */ -#define xfs_isize_check(ip, isize) -#endif /* DEBUG */ - -/* * Free up the underlying blocks past new_size. The new size must be smaller * than the current size. This routine can be used both for the attribute and * data fork, and does not modify the inode size, which is left to the caller. @@ -1252,12 +1195,14 @@ xfs_itruncate_extents( int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(new_size <= ip->i_size); + ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); ASSERT(ip->i_itemp->ili_lock_flags == 0); ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + trace_xfs_itruncate_extents_start(ip, new_size); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1325,6 +1270,14 @@ xfs_itruncate_extents( goto out; } + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + out: *tpp = tp; return error; @@ -1338,74 +1291,6 @@ out_bmap_cancel: goto out; } -int -xfs_itruncate_data( - struct xfs_trans **tpp, - struct xfs_inode *ip, - xfs_fsize_t new_size) -{ - int error; - - trace_xfs_itruncate_data_start(ip, new_size); - - /* - * The first thing we do is set the size to new_size permanently on - * disk. This way we don't have to worry about anyone ever being able - * to look at the data being freed even in the face of a crash. - * What we're getting around here is the case where we free a block, it - * is allocated to another file, it is written to, and then we crash. - * If the new data gets written to the file but the log buffers - * containing the free and reallocation don't, then we'd end up with - * garbage in the blocks being freed. As long as we make the new_size - * permanent before actually freeing any blocks it doesn't matter if - * they get written to. - */ - if (ip->i_d.di_nextents > 0) { - /* - * If we are not changing the file size then do not update - * the on-disk file size - we may be called from - * xfs_inactive_free_eofblocks(). If we update the on-disk - * file size and then the system crashes before the contents - * of the file are flushed to disk then the files may be - * full of holes (ie NULL files bug). - */ - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - } - } - - error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size); - if (error) - return error; - - /* - * If we are not changing the file size then do not update the on-disk - * file size - we may be called from xfs_inactive_free_eofblocks(). - * If we update the on-disk file size and then the system crashes - * before the contents of the file are flushed to disk then the files - * may be full of holes (ie NULL files bug). - */ - xfs_isize_check(ip, new_size); - if (ip->i_size != new_size) { - ip->i_d.di_size = new_size; - ip->i_size = new_size; - } - - ASSERT(new_size != 0 || ip->i_delayed_blks == 0); - ASSERT(new_size != 0 || ip->i_d.di_nextents == 0); - - /* - * Always re-log the inode so that our permanent transaction can keep - * on rolling it forward in the log. - */ - xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); - - trace_xfs_itruncate_data_end(ip, new_size); - return 0; -} - /* * This is called when the inode's link count goes to 0. * We place the on-disk inode on a list in the AGI. It @@ -1824,8 +1709,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - (!S_ISREG(ip->i_d.di_mode))); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -1844,8 +1728,6 @@ xfs_ifree( ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2151,7 +2033,7 @@ xfs_idestroy_fork( * once someone is waiting for it to be unpinned. */ static void -xfs_iunpin_nowait( +xfs_iunpin( struct xfs_inode *ip) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); @@ -2163,14 +2045,29 @@ xfs_iunpin_nowait( } +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + void xfs_iunpin_wait( struct xfs_inode *ip) { - if (xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); - wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); - } + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); } /* @@ -2510,9 +2407,9 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; @@ -2529,7 +2426,7 @@ xfs_iflush( * out for us if they occur after the log force completes. */ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin_nowait(ip); + xfs_iunpin(ip); xfs_ifunlock(ip); return EAGAIN; } @@ -2626,9 +2523,9 @@ xfs_iflush_int( #endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); iip = ip->i_itemp; mp = ip->i_mount; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b4cd4739f98..2f27b745408 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -66,7 +66,6 @@ typedef struct xfs_ifork { struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ - unsigned char if_ext_max; /* max # of extent records */ union { xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ @@ -206,12 +205,12 @@ typedef struct xfs_icdinode { ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ ((ip)->i_d.di_anextents = (n))) - +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) #ifdef __KERNEL__ -struct bhv_desc; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -220,12 +219,6 @@ struct xfs_mount; struct xfs_trans; struct xfs_dquot; -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; - typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ @@ -244,27 +237,19 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - struct completion i_flush; /* inode flush completion q */ atomic_t i_pincount; /* inode pin count */ - wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ - unsigned short i_flags; /* see defined flags below */ + unsigned long i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_fsize_t i_size; /* in-memory size */ - xfs_fsize_t i_new_size; /* size when write completes */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ - (ip)->i_size : (ip)->i_d.di_size; - /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { @@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* * i_flags helper functions */ static inline void @@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was chosen @@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip, } /* - * Manage the i_flush queue embedded in the inode. This completion - * queue synchronizes processes attempting to flush the in-core - * inode back to disk. - */ -static inline void xfs_iflock(xfs_inode_t *ip) -{ - wait_for_completion(&ip->i_flush); -} - -static inline int xfs_iflock_nowait(xfs_inode_t *ip) -{ - return try_wait_for_completion(&ip->i_flush); -} - -static inline void xfs_ifunlock(xfs_inode_t *ip) -{ - complete(&ip->i_flush); -} - -/* * In-core inode flags. */ -#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ -#define XFS_ISTALE 0x0002 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ -#define XFS_INEW 0x0008 /* inode has just been allocated */ -#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ -#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ -#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) XFS_IFILESTREAM); /* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) @@ -481,7 +503,7 @@ void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. */ -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); @@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); -int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *, - xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index abaafdbb3e6..91d71dcd485 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -79,8 +79,6 @@ xfs_inode_item_size( break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); iip->ili_format.ilf_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV | XFS_ILOG_UUID); @@ -437,7 +435,6 @@ xfs_inode_item_format( * Assert that no attribute-related log flags are set. */ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); @@ -521,7 +518,6 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; } @@ -559,7 +555,7 @@ xfs_inode_item_unpin( trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } /* @@ -721,7 +717,7 @@ xfs_inode_item_pushbuf( * If a flush is not in progress anymore, chances are that the * inode was taken off the AIL. So, just get out. */ - if (completion_done(&ip->i_flush) || + if (!xfs_isiflocked(ip) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return true; @@ -754,7 +750,7 @@ xfs_inode_item_push( struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(!completion_done(&ip->i_flush)); + ASSERT(xfs_isiflocked(ip)); /* * Since we were able to lock the inode's flush lock and diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d99a9051890..76f3ca5cfc3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -559,23 +559,23 @@ xfs_attrmulti_by_handle( ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623bfbb8..f9ccb7b7c04 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -454,23 +454,23 @@ xfs_compat_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + ops[i].am_error = mnt_want_write_file(parfilp); if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); - mnt_drop_write(parfilp->f_path.mnt); + mnt_drop_write_file(parfilp); break; default: ops[i].am_error = EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 9afa282aa93..246c7d57c6f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb( xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align; + xfs_extlen_t align = 0; int eof, error; - if (XFS_IS_REALTIME_INODE(ip)) - ; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) - new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) - new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) + new_last_fsb = roundup_64(*last_fsb, align); + } /* * Always round up the allocation request to an extent boundary @@ -154,7 +154,7 @@ xfs_iomap_write_direct( offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > ip->i_size) { + if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; @@ -211,7 +211,7 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); bmapi_flag = 0; - if (offset < ip->i_size || extsz) + if (offset < XFS_ISIZE(ip) || extsz) bmapi_flag |= XFS_BMAPI_PREALLOC; /* @@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate( int found_delalloc = 0; *prealloc = 0; - if ((offset + count) <= ip->i_size) + if (offset + count <= XFS_ISIZE(ip)) return 0; /* @@ -340,7 +340,7 @@ xfs_iomap_prealloc_size( * if we pass in alloc_blocks = 0. Hence the "+ 1" to * ensure we always pass in a non-zero value. */ - alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; + alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -564,7 +564,7 @@ xfs_iomap_write_allocate( * back.... */ nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, ip->i_size); + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); error = xfs_bmap_last_offset(NULL, ip, &last_block, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 23ce927973a..ab302539e5b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -168,7 +168,7 @@ STATIC int xfs_vn_mknod( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, dev_t rdev) { struct inode *inode; @@ -231,7 +231,7 @@ STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, - int mode, + umode_t mode, struct nameidata *nd) { return xfs_vn_mknod(dir, dentry, mode, 0); @@ -241,7 +241,7 @@ STATIC int xfs_vn_mkdir( struct inode *dir, struct dentry *dentry, - int mode) + umode_t mode) { return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); } @@ -366,7 +366,7 @@ xfs_vn_symlink( struct xfs_inode *cip = NULL; struct xfs_name name; int error; - mode_t mode; + umode_t mode; mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); @@ -750,6 +750,7 @@ xfs_setattr_size( struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); int mask = iattr->ia_valid; + xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; uint lock_flags; @@ -777,11 +778,13 @@ xfs_setattr_size( lock_flags |= XFS_IOLOCK_EXCL; xfs_ilock(ip, lock_flags); + oldsize = inode->i_size; + newsize = iattr->ia_size; + /* * Short circuit the truncate case for zero length files. */ - if (iattr->ia_size == 0 && - ip->i_size == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { if (!(mask & (ATTR_CTIME|ATTR_MTIME))) goto out_unlock; @@ -807,14 +810,14 @@ xfs_setattr_size( * the inode to the transaction, because the inode cannot be unlocked * once it is a part of the transaction. */ - if (iattr->ia_size > ip->i_size) { + if (newsize > oldsize) { /* * Do the first part of growing a file: zero any data in the * last block that is beyond the old EOF. We need to do this * before the inode is joined to the transaction to modify * i_size. */ - error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); + error = xfs_zero_eof(ip, newsize, oldsize); if (error) goto out_unlock; } @@ -833,8 +836,8 @@ xfs_setattr_size( * here and prevents waiting for other data not within the range we * care about here. */ - if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, + if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, FI_NONE); if (error) goto out_unlock; @@ -845,8 +848,7 @@ xfs_setattr_size( */ inode_dio_wait(inode); - error = -block_truncate_page(inode->i_mapping, iattr->ia_size, - xfs_get_blocks); + error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) goto out_unlock; @@ -857,7 +859,7 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - truncate_setsize(inode, iattr->ia_size); + truncate_setsize(inode, newsize); commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; @@ -876,19 +878,29 @@ xfs_setattr_size( * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (iattr->ia_size != ip->i_size && - (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { iattr->ia_ctime = iattr->ia_mtime = current_fs_time(inode->i_sb); mask |= ATTR_CTIME | ATTR_MTIME; } - if (iattr->ia_size > ip->i_size) { - ip->i_d.di_size = iattr->ia_size; - ip->i_size = iattr->ia_size; - } else if (iattr->ia_size <= ip->i_size || - (iattr->ia_size == 0 && ip->i_d.di_nextents)) { - error = xfs_itruncate_data(&tp, ip, iattr->ia_size); + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) goto out_trans_abort; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 34817adf4b9..e2cc3568c29 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -760,38 +760,6 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_cil); } -/* - * Write region vectors to log. The write happens using the space reservation - * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). However, it is important - * to note that the transaction reservation code makes an assumption about the - * number of log headers a transaction requires that may be violated if you - * don't pass all the transaction vectors in one call.... - */ -int -xfs_log_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn) -{ - struct log *log = mp->m_log; - int error; - struct xfs_log_vec vec = { - .lv_niovecs = nentries, - .lv_iovecp = reg, - }; - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - void xfs_log_move_tail(xfs_mount_t *mp, xfs_lsn_t tail_lsn) @@ -1685,7 +1653,7 @@ xlog_print_tic_res( }; xfs_warn(mp, - "xfs_log_write: reservation summary:\n" + "xlog_write: reservation summary:\n" " trans type = %s (%u)\n" " unit res = %d bytes\n" " current res = %d bytes\n" @@ -1714,7 +1682,7 @@ xlog_print_tic_res( } xfs_alert_tag(mp, XFS_PTAG_LOGRES, - "xfs_log_write: reservation ran out. Need to up reservation"); + "xlog_write: reservation ran out. Need to up reservation"); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1968,23 +1936,21 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (log->l_cilp) { - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. - */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - } else - ticket->t_curr_res -= len; + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); if (ticket->t_curr_res < 0) xlog_print_tic_res(log->l_mp, ticket); @@ -2931,8 +2897,7 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) - xlog_cil_force(log); + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3081,11 +3046,9 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); - if (log->l_cilp) { - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; - } + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; try_again: spin_lock(&log->l_icloglock); @@ -3653,7 +3616,7 @@ xfs_log_force_umount( * completed transactions are flushed to disk with the xfs_log_force() * call below. */ - if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + if (!logerror) xlog_cil_force(log); /* diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3f7bf451c03..2aee3b22d29 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -174,11 +174,6 @@ int xfs_log_reserve(struct xfs_mount *mp, __uint8_t clientid, uint flags, uint t_type); -int xfs_log_write(struct xfs_mount *mp, - xfs_log_iovec_t region[], - int nentries, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); @@ -189,8 +184,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c7755d5a5fb..d4fadbe8ac9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -32,10 +32,7 @@ #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. If the CIL is not - * enabled in this filesystem, ensure the log->l_cilp is null so - * we can check this conditional to determine if we are doing delayed - * logging or not. + * Perform initial CIL structure initialisation. */ int xlog_cil_init( @@ -44,10 +41,6 @@ xlog_cil_init( struct xfs_cil *cil; struct xfs_cil_ctx *ctx; - log->l_cilp = NULL; - if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) - return 0; - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); if (!cil) return ENOMEM; @@ -80,9 +73,6 @@ void xlog_cil_destroy( struct log *log) { - if (!log->l_cilp) - return; - if (log->l_cilp->xc_ctx) { if (log->l_cilp->xc_ctx->ticket) xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); @@ -137,9 +127,6 @@ void xlog_cil_init_post_recovery( struct log *log) { - if (!log->l_cilp) - return; - log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, @@ -172,37 +159,73 @@ xlog_cil_init_post_recovery( * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ -static void -xlog_cil_format_items( - struct log *log, - struct xfs_log_vec *log_vector) +static struct xfs_log_vec * +xlog_cil_prepare_log_vecs( + struct xfs_trans *tp) { - struct xfs_log_vec *lv; + struct xfs_log_item_desc *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - ASSERT(log_vector); - for (lv = log_vector; lv; lv = lv->lv_next) { + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return NULL; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_vec *new_lv; void *ptr; int index; int len = 0; + uint niovecs; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* Skip items that do not have any vectors for writing */ + niovecs = IOP_SIZE(lidp->lid_item); + if (!niovecs) + continue; + + new_lv = kmem_zalloc(sizeof(*new_lv) + + niovecs * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = niovecs; + new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ - IOP_FORMAT(lv->lv_item, lv->lv_iovecp); - for (index = 0; index < lv->lv_niovecs; index++) - len += lv->lv_iovecp[index].i_len; + IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); + for (index = 0; index < new_lv->lv_niovecs; index++) + len += new_lv->lv_iovecp[index].i_len; - lv->lv_buf_len = len; - lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); - ptr = lv->lv_buf; + new_lv->lv_buf_len = len; + new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, + KM_SLEEP|KM_NOFS); + ptr = new_lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + for (index = 0; index < new_lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; memcpy(ptr, vec->i_addr, vec->i_len); vec->i_addr = ptr; ptr += vec->i_len; } - ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); + + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; } + + return ret_lv; } /* @@ -256,7 +279,7 @@ xfs_cil_prepare_item( * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space - * as well. Remove the amount of space we addded to the checkpoint ticket from + * as well. Remove the amount of space we added to the checkpoint ticket from * the current transaction ticket so that the accounting works out correctly. */ static void @@ -635,28 +658,30 @@ out_abort: * background commit, returns without it held once background commits are * allowed again. */ -void +int xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags) { struct log *log = mp->m_log; int log_flags = 0; int push = 0; + struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) log_flags = XFS_LOG_REL_PERM_RESERV; /* - * do all the hard work of formatting items (including memory + * Do all the hard work of formatting items (including memory * allocation) outside the CIL context lock. This prevents stalling CIL * pushes when we are low on memory and a transaction commit spends a * lot of time in memory reclaim. */ - xlog_cil_format_items(log, log_vector); + log_vector = xlog_cil_prepare_log_vecs(tp); + if (!log_vector) + return ENOMEM; /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); @@ -709,6 +734,7 @@ xfs_log_commit_cil( */ if (push) xlog_cil_push(log, 0); + return 0; } /* @@ -786,8 +812,6 @@ xfs_log_item_in_current_chkpt( { struct xfs_cil_ctx *ctx; - if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) - return false; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 541a508adea..0ed9ee77937 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -1981,7 +1981,7 @@ xfs_qm_dqcheck( if (!errs && ddq->d_id) { if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit)) { if (!ddq->d_btimer) { if (flags & XFS_QMOPT_DOWARN) @@ -1992,7 +1992,7 @@ xfs_qm_dqcheck( } } if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit)) { if (!ddq->d_itimer) { if (flags & XFS_QMOPT_DOWARN) @@ -2003,7 +2003,7 @@ xfs_qm_dqcheck( } } if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit)) { if (!ddq->d_rtbtimer) { if (flags & XFS_QMOPT_DOWARN) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb24dac42a2..19f69e23250 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0bbb1a41998..c436def733b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -50,7 +50,6 @@ */ struct mutex xfs_Gqm_lock; struct xfs_qm *xfs_Gqm; -uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; @@ -93,7 +92,6 @@ xfs_Gqm_init(void) goto out_free_udqhash; hsize /= sizeof(xfs_dqhash_t); - ndquot = hsize << 8; xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); xqm->qm_dqhashmask = hsize - 1; @@ -137,7 +135,6 @@ xfs_Gqm_init(void) xqm->qm_dqtrxzone = qm_dqtrxzone; atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; return xqm; @@ -154,12 +151,17 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { - struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); ASSERT(xqm->qm_nrefs == 0); + unregister_shrinker(&xfs_qm_shaker); + + mutex_lock(&xqm->qm_dqfrlist_lock); + ASSERT(list_empty(&xqm->qm_dqfrlist)); + mutex_unlock(&xqm->qm_dqfrlist_lock); + hsize = xqm->qm_dqhashmask + 1; for (i = 0; i < hsize; i++) { xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); @@ -171,17 +173,6 @@ xfs_qm_destroy( xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - /* frlist cleanup */ - mutex_lock(&xqm->qm_dqfrlist_lock); - list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } - mutex_unlock(&xqm->qm_dqfrlist_lock); - mutex_destroy(&xqm->qm_dqfrlist_lock); kmem_free(xqm); } @@ -232,34 +223,10 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *n; - ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); /* - * Go thru the freelist and destroy all inactive dquots. - */ - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - - list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } else { - xfs_dqunlock(dqp); - } - } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - - /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll * be restarted. */ @@ -415,8 +382,7 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - struct xfs_mount *mp, - int sync_mode) + struct xfs_mount *mp) { struct xfs_quotainfo *q = mp->m_quotainfo; int recl; @@ -429,7 +395,8 @@ again: mutex_lock(&q->qi_dqlist_lock); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if (! XFS_DQ_IS_DIRTY(dqp)) { + if ((dqp->dq_flags & XFS_DQ_FREEING) || + !XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); continue; } @@ -444,14 +411,14 @@ again: * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - xfs_qm_dqflock_pushbuf_wait(dqp); + xfs_dqflock_pushbuf_wait(dqp); } /* * Let go of the mplist lock. We don't want to hold it * across a disk write. */ mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, sync_mode); + error = xfs_qm_dqflush(dqp, 0); xfs_dqunlock(dqp); if (error) return error; @@ -468,6 +435,7 @@ again: /* return ! busy */ return 0; } + /* * Release the group dquot pointers the user dquots may be * carrying around as a hint. mplist is locked on entry and exit. @@ -478,31 +446,26 @@ xfs_qm_detach_gdquots( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *gdqp; - int nrecl; again: ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); - if ((gdqp = dqp->q_gdquot)) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - if (gdqp) { - /* - * Can't hold the mplist lock across a dqput. - * XXXmust convert to marker based iterations here. - */ - nrecl = q->qi_dqreclaims; + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); mutex_unlock(&q->qi_dqlist_lock); - xfs_qm_dqput(gdqp); - + delay(1); mutex_lock(&q->qi_dqlist_lock); - if (nrecl != q->qi_dqreclaims) - goto again; + goto again; } + + gdqp = dqp->q_gdquot; + if (gdqp) + dqp->q_gdquot = NULL; + xfs_dqunlock(dqp); + + if (gdqp) + xfs_qm_dqrele(gdqp); } } @@ -520,8 +483,8 @@ xfs_qm_dqpurge_int( struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dquot *dqp, *n; uint dqtype; - int nrecl; - int nmisses; + int nmisses = 0; + LIST_HEAD (dispose_list); if (!q) return 0; @@ -540,47 +503,26 @@ xfs_qm_dqpurge_int( */ xfs_qm_detach_gdquots(mp); - again: - nmisses = 0; - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* - * Try to get rid of all of the unwanted dquots. The idea is to - * get them off mplist and hashlist, but leave them on freelist. + * Try to get rid of all of the unwanted dquots. */ list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { - /* - * It's OK to look at the type without taking dqlock here. - * We're holding the mplist lock here, and that's needed for - * a dqreclaim. - */ - if ((dqp->dq_flags & dqtype) == 0) - continue; - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = q->qi_dqreclaims; - mutex_unlock(&q->qi_dqlist_lock); - mutex_lock(&dqp->q_hash->qh_lock); - mutex_lock(&q->qi_dqlist_lock); - - /* - * XXXTheoretically, we can get into a very long - * ping pong game here. - * No one can be adding dquots to the mplist at - * this point, but somebody might be taking things off. - */ - if (nrecl != q->qi_dqreclaims) { - mutex_unlock(&dqp->q_hash->qh_lock); - goto again; - } + xfs_dqlock(dqp); + if ((dqp->dq_flags & dqtype) != 0 && + !(dqp->dq_flags & XFS_DQ_FREEING)) { + if (dqp->q_nrefs == 0) { + dqp->dq_flags |= XFS_DQ_FREEING; + list_move_tail(&dqp->q_mplist, &dispose_list); + } else + nmisses++; } - - /* - * Take the dquot off the mplist and hashlist. It may remain on - * freelist in INACTIVE state. - */ - nmisses += xfs_qm_dqpurge(dqp); + xfs_dqunlock(dqp); } mutex_unlock(&q->qi_dqlist_lock); + + list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist) + xfs_qm_dqpurge(dqp); + return nmisses; } @@ -648,12 +590,9 @@ xfs_qm_dqattach_one( */ dqp = udqhint->q_gdquot; if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - xfs_dqunlock(dqp); + *IO_idqpp = xfs_qm_dqhold(dqp); xfs_dqunlock(udqhint); return 0; } @@ -693,11 +632,7 @@ xfs_qm_dqattach_one( /* * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. The idea sounds simple, but the - * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering constraints. - * The process is complicated more by the fact that the dquots may or may not - * be locked on entry. + * udquot as a hint for future lookups. */ STATIC void xfs_qm_dqattach_grouphint( @@ -708,45 +643,17 @@ xfs_qm_dqattach_grouphint( xfs_dqlock(udq); - if ((tmp = udq->q_gdquot)) { - if (tmp == gdq) { - xfs_dqunlock(udq); - return; - } + tmp = udq->q_gdquot; + if (tmp) { + if (tmp == gdq) + goto done; udq->q_gdquot = NULL; - /* - * We can't keep any dqlocks when calling dqrele, - * because the freelist lock comes before dqlocks. - */ - xfs_dqunlock(udq); - /* - * we took a hard reference once upon a time in dqget, - * so give it back when the udquot no longer points at it - * dqput() does the unlocking of the dquot. - */ xfs_qm_dqrele(tmp); - - xfs_dqlock(udq); - xfs_dqlock(gdq); - - } else { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - xfs_dqlock(gdq); } - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - /* - * Somebody could have attached a gdquot here, - * when we dropped the uqlock. If so, just do nothing. - */ - if (udq->q_gdquot == NULL) { - XFS_DQHOLD(gdq); - udq->q_gdquot = gdq; - } - - xfs_dqunlock(gdq); + udq->q_gdquot = xfs_qm_dqhold(gdq); +done: xfs_dqunlock(udq); } @@ -813,17 +720,13 @@ xfs_qm_dqattach_locked( ASSERT(ip->i_gdquot); /* - * We may or may not have the i_udquot locked at this point, - * but this check is OK since we don't depend on the i_gdquot to - * be accurate 100% all the time. It is just a hint, and this - * will succeed in general. + * We do not have i_udquot locked at this point, but this check + * is OK since we don't depend on the i_gdquot to be accurate + * 100% all the time. It is just a hint, and this will + * succeed in general. */ - if (ip->i_udquot->q_gdquot == ip->i_gdquot) - goto done; - /* - * Attach i_gdquot to the gdquot hint inside the i_udquot. - */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); + if (ip->i_udquot->q_gdquot != ip->i_gdquot) + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: @@ -879,100 +782,6 @@ xfs_qm_dqdetach( } } -int -xfs_qm_sync( - struct xfs_mount *mp, - int flags) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - int recl, restarts; - struct xfs_dquot *dqp; - int error; - - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return 0; - - restarts = 0; - - again: - mutex_lock(&q->qi_dqlist_lock); - /* - * dqpurge_all() also takes the mplist lock and iterate thru all dquots - * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared - * when we have the mplist lock, we know that dquots will be consistent - * as long as we have it locked. - */ - if (!XFS_IS_QUOTA_ON(mp)) { - mutex_unlock(&q->qi_dqlist_lock); - return 0; - } - ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); - list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { - /* - * If this is vfs_sync calling, then skip the dquots that - * don't 'seem' to be dirty. ie. don't acquire dqlock. - * This is very similar to what xfs_sync does with inodes. - */ - if (flags & SYNC_TRYLOCK) { - if (!XFS_DQ_IS_DIRTY(dqp)) - continue; - if (!xfs_qm_dqlock_nowait(dqp)) - continue; - } else { - xfs_dqlock(dqp); - } - - /* - * Now, find out for sure if this dquot is dirty or not. - */ - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = q->qi_dqreclaims; - if (!xfs_dqflock_nowait(dqp)) { - if (flags & SYNC_TRYLOCK) { - xfs_dqunlock(dqp); - continue; - } - /* - * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, so - * see if we can give a push to the buffer before we wait - * on the flush lock. At this point, we know that - * even though the dquot is being flushed, - * it has (new) dirty data. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write - */ - mutex_unlock(&q->qi_dqlist_lock); - error = xfs_qm_dqflush(dqp, flags); - xfs_dqunlock(dqp); - if (error && XFS_FORCED_SHUTDOWN(mp)) - return 0; /* Need to prevent umount failure */ - else if (error) - return error; - - mutex_lock(&q->qi_dqlist_lock); - if (recl != q->qi_dqreclaims) { - if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) - break; - - mutex_unlock(&q->qi_dqlist_lock); - goto again; - } - } - - mutex_unlock(&q->qi_dqlist_lock); - return 0; -} - /* * The hash chains and the mplist use the same xfs_dqhash structure as * their list head, but we can take the mplist qh_lock and one of the @@ -1034,18 +843,21 @@ xfs_qm_init_quotainfo( /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. + * * We look at the USR dquot with id == 0 first, but if user quotas * are not enabled we goto the GRP dquot with id == 0. * We don't really care to keep separate default limits for user * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. */ - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, - &dqp); - if (! error) { + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { xfs_disk_dquot_t *ddqp = &dqp->q_core; /* @@ -1072,11 +884,6 @@ xfs_qm_init_quotainfo( qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - /* - * We sent the XFS_QMOPT_DQSUSER flag to dqget because - * we don't want this dquot cached. We haven't done a - * quotacheck yet, and quotacheck doesn't like incore dquots. - */ xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -1661,7 +1468,7 @@ xfs_qm_quotacheck( * successfully. */ if (!error) - error = xfs_qm_dqflush_all(mp, 0); + error = xfs_qm_dqflush_all(mp); /* * We can get this error if we couldn't do a dquot allocation inside @@ -1790,257 +1597,150 @@ xfs_qm_init_quotainos( return 0; } - - -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) { - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int startagain; + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; - restarts = 0; - dqpout = NULL; + mutex_lock(&dqp->q_hash->qh_lock); + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + mutex_unlock(&dqp->q_hash->qh_lock); - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ -again: - startagain = 0; - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + mutex_lock(&qi->qi_dqlist_lock); + list_del_init(&dqp->q_mplist); + qi->qi_dquots--; + qi->qi_dqreclaims++; + mutex_unlock(&qi->qi_dqlist_lock); - list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { - struct xfs_mount *mp = dqp->q_mount; - xfs_dqlock(dqp); + xfs_qm_dqdestroy(dqp); +} - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - restarts++; - startagain = 1; - goto dqunlock; - } +STATIC void +xfs_qm_dqreclaim_one( + struct xfs_dquot *dqp, + struct list_head *dispose_list) +{ + struct xfs_mount *mp = dqp->q_mount; + int error; - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(mp == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(list_empty(&dqp->q_hashlist)); - ASSERT(list_empty(&dqp->q_mplist)); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - goto dqunlock; - } + if (!xfs_dqlock_nowait(dqp)) + goto out_busy; - ASSERT(dqp->q_hash); - ASSERT(!list_empty(&dqp->q_mplist)); + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); - /* - * Try to grab the flush lock. If this dquot is in the process - * of getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto dqunlock; + trace_xfs_dqreclaim_want(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + return; + } - trace_xfs_dqreclaim_dirty(dqp); + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - } - goto dqunlock; - } + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto out_busy; + + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + trace_xfs_dqreclaim_dirty(dqp); /* - * We're trying to get the hashlock out of order. This races - * with dqlookup; so, we giveup and goto the next dquot if - * we couldn't get the hashlock. This way, we won't starve - * a dqlookup process that holds the hashlock that is - * waiting for the freelist lock. + * We flush it delayed write, so don't bother releasing the + * freelist lock. */ - if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - restarts++; - goto dqfunlock; + error = xfs_qm_dqflush(dqp, 0); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); } /* - * This races with dquot allocation code as well as dqflush_all - * and reclaim code. So, if we failed to grab the mplist lock, - * giveup everything and start over. + * Give the dquot another try on the freelist, as the + * flushing will take some time. */ - if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { - restarts++; - startagain = 1; - goto qhunlock; - } - - ASSERT(dqp->q_nrefs == 0); - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dquots--; - mp->m_quotainfo->qi_dqreclaims++; - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - dqpout = dqp; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); -qhunlock: - mutex_unlock(&dqp->q_hash->qh_lock); -dqfunlock: - xfs_dqfunlock(dqp); -dqunlock: - xfs_dqunlock(dqp); - if (dqpout) - break; - if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - break; - if (startagain) { - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - goto again; - } + goto out_busy; } - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqpout; -} + xfs_dqfunlock(dqp); -/* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - */ -STATIC int -xfs_qm_shake_freelist( - int howmany) -{ - int nreclaimed = 0; - xfs_dquot_t *dqp; + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); - if (howmany <= 0) - return 0; + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_freelist, dispose_list); + xfs_Gqm->qm_dqfrlist_cnt--; - while (nreclaimed < howmany) { - dqp = xfs_qm_dqreclaim_one(); - if (!dqp) - return nreclaimed; - xfs_qm_dqdestroy(dqp); - nreclaimed++; - } - return nreclaimed; + trace_xfs_dqreclaim_done(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + return; + +out_busy: + xfs_dqunlock(dqp); + + /* + * Move the dquot to the tail of the list so that we don't spin on it. + */ + list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + + trace_xfs_dqreclaim_busy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); } -/* - * The kmem_shake interface is invoked when memory is running low. - */ -/* ARGSUSED */ STATIC int xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) + struct shrinker *shrink, + struct shrink_control *sc) { - int ndqused, nfree, n; - gfp_t gfp_mask = sc->gfp_mask; - - if (!kmem_shake_allow(gfp_mask)) - return 0; - if (!xfs_Gqm) - return 0; - - nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ - /* incore dquots in all f/s's */ - ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; - - ASSERT(ndqused >= 0); + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (dispose_list); + struct xfs_dquot *dqp; - if (nfree <= ndqused && nfree < ndquot) + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; + if (!nr_to_scan) + goto out; - ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ - n = nfree - ndqused - ndquot; /* # over target */ - - return xfs_qm_shake_freelist(MAX(nfree, n)); -} - - -/*------------------------------------------------------------------*/ - -/* - * Return a new incore dquot. Depending on the number of - * dquots in the system, we either allocate a new one on the kernel heap, - * or reclaim a free one. - * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed - * to reclaim an existing one from the freelist. - */ -boolean_t -xfs_qm_dqalloc_incore( - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - - /* - * Check against high water mark to see if we want to pop - * a nincompoop dquot off the freelist. - */ - if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { - /* - * Try to recycle a dquot from the freelist. - */ - if ((dqp = xfs_qm_dqreclaim_one())) { - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); - /* - * Just zero the core here. The rest will get - * reinitialized by caller. XXX we shouldn't even - * do this zero ... - */ - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - *O_dqpp = dqp; - return B_FALSE; - } - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { + if (nr_to_scan-- <= 0) + break; + dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, + q_freelist); + xfs_qm_dqreclaim_one(dqp, &dispose_list); } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * Allocate a brand new dquot on the kernel heap and return it - * to the caller to initialize. - */ - ASSERT(xfs_Gqm->qm_dqzone != NULL); - *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); - atomic_inc(&xfs_Gqm->qm_totaldquots); - - return B_TRUE; + while (!list_empty(&dispose_list)) { + dqp = list_first_entry(&dispose_list, struct xfs_dquot, + q_freelist); + list_del_init(&dqp->q_freelist); + xfs_qm_dqfree_one(dqp); + } +out: + return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; } - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. @@ -2151,10 +1851,7 @@ xfs_qm_vop_dqalloc( * this to caller */ ASSERT(ip->i_udquot); - uq = ip->i_udquot; - xfs_dqlock(uq); - XFS_DQHOLD(uq); - xfs_dqunlock(uq); + uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { @@ -2175,10 +1872,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { @@ -2198,10 +1892,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); + gq = xfs_qm_dqhold(ip->i_gdquot); } } if (uq) @@ -2251,14 +1942,10 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* - * Take an extra reference, because the inode - * is going to keep this dquot pointer even - * after the trans_commit. + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. */ - xfs_dqlock(newdq); - XFS_DQHOLD(newdq); - xfs_dqunlock(newdq); - *IO_olddq = newdq; + *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } @@ -2390,25 +2077,21 @@ xfs_qm_vop_create_dqattach( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp) { - xfs_dqlock(udqp); - XFS_DQHOLD(udqp); - xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); - ip->i_udquot = udqp; ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp) { - xfs_dqlock(gdqp); - XFS_DQHOLD(gdqp); - xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); - ip->i_gdquot = gdqp; ASSERT(XFS_IS_OQUOTA_ON(mp)); ASSERT((XFS_IS_GQUOTA_ON(mp) ? ip->i_d.di_gid : xfs_get_projid(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe1052..9a9b997e1a0 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -26,30 +26,12 @@ struct xfs_qm; struct xfs_inode; -extern uint ndquot; extern struct mutex xfs_Gqm_lock; extern struct xfs_qm *xfs_Gqm; extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; /* - * Used in xfs_qm_sync called by xfs_sync to count the max times that it can - * iterate over the mountpt's dquot list in one call. - */ -#define XFS_QM_SYNC_MAX_RESTARTS 7 - -/* - * Ditto, for xfs_qm_dqreclaim_one. - */ -#define XFS_QM_RECLAIM_MAX_RESTARTS 4 - -/* - * Ideal ratio of free to in use dquots. Quota manager makes an attempt - * to keep this balance. - */ -#define XFS_QM_DQFREE_RATIO 2 - -/* * Dquot hashtable constants/threshold values. */ #define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t)) @@ -80,7 +62,6 @@ typedef struct xfs_qm { int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ - int qm_dqfree_ratio;/* ratio of free to inuse dquots */ kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ } xfs_qm_t; @@ -149,7 +130,6 @@ extern int xfs_qm_quotacheck(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); /* dquot stuff */ -extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b3264..5729ba57087 100644 --- a/fs/xfs/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c @@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ seq_printf(m, "%d\t%d\t%d\t%u\n", - ndquot, + 0, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + 0, xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5cc3dde1bc9..711a86e39ff 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -31,6 +31,7 @@ #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); goto out_unlock; } + ASSERT(ip->i_d.di_nextents == 0); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); @@ -807,11 +813,11 @@ xfs_qm_export_dquot( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index a595f29567f..8a0807e0f97 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,8 +87,7 @@ typedef struct xfs_dqblk { #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) @@ -97,8 +96,7 @@ typedef struct xfs_dqblk { { XFS_DQ_PROJ, "PROJ" }, \ { XFS_DQ_GROUP, "GROUP" }, \ { XFS_DQ_DIRTY, "DIRTY" }, \ - { XFS_DQ_WANT, "WANT" }, \ - { XFS_DQ_INACTIVE, "INACTIVE" } + { XFS_DQ_FREEING, "FREEING" } /* * In the worst case, when both user and group quotas are on, @@ -199,7 +197,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ @@ -326,7 +323,6 @@ extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); -extern int xfs_qm_sync(struct xfs_mount *, int); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); @@ -366,10 +362,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) -{ - return 0; -} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8a899496fd5..ee5b695c99a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -199,7 +199,6 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_flags |= XFS_MOUNT_DELAYLOG; /* * These can be overridden by the mount option parsing. @@ -353,11 +352,11 @@ xfs_parseargs( mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - mp->m_flags |= XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "delaylog is the default now, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - mp->m_flags &= ~XFS_MOUNT_DELAYLOG; xfs_warn(mp, - "nodelaylog is deprecated and will be removed in Linux 3.3"); + "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -395,13 +394,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DISCARD) && - !(mp->m_flags & XFS_MOUNT_DELAYLOG)) { - xfs_warn(mp, - "the discard option is incompatible with the nodelaylog option"); - return EINVAL; - } - #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { xfs_warn(mp, "quota support not available in this kernel."); @@ -501,7 +493,6 @@ xfs_showargs( { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, - { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { 0, NULL } }; @@ -837,14 +828,6 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - init_waitqueue_head(&ip->i_ipin_wait); - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&ip->i_flush); - complete(&ip->i_flush); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); @@ -1014,17 +997,10 @@ xfs_fs_sync_fs( int error; /* - * Not much we can do for the first async pass. Writing out the - * superblock would be counter-productive as we are going to redirty - * when writing out other data and metadata (and writing out a single - * block is quite fast anyway). - * - * Try to asynchronously kick off quota syncing at least. + * Doing anything during the async pass would be counterproductive. */ - if (!wait) { - xfs_qm_sync(mp, SYNC_TRYLOCK); + if (!wait) return 0; - } error = xfs_quiesce_data(mp); if (error) @@ -1238,9 +1214,9 @@ xfs_fs_unfreeze( STATIC int xfs_fs_show_options( struct seq_file *m, - struct vfsmount *mnt) + struct dentry *root) { - return -xfs_showargs(XFS_M(mnt->mnt_sb), m); + return -xfs_showargs(XFS_M(root->d_sb), m); } /* @@ -1621,12 +1597,12 @@ STATIC int __init xfs_init_workqueues(void) { /* - * max_active is set to 8 to give enough concurency to allow - * multiple work operations on each CPU to run. This allows multiple - * filesystems to be running sync work concurrently, and scales with - * the number of CPUs in the system. + * We never want to the same work item to run twice, reclaiming inodes + * or idling the log is not going to get any faster by multiple CPUs + * competing for ressources. Use the default large max_active value + * so that even lots of filesystems can perform these task in parallel. */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); if (!xfs_syncd_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index f0994aedcd1..40b75eecd2b 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -395,10 +395,7 @@ xfs_quiesce_data( */ xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); - xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_qm_sync(mp, SYNC_WAIT); - - /* force out the newly dirtied log buffers */ + /* force out the log */ xfs_log_force(mp, XFS_LOG_SYNC); /* write superblock and hoover up shutdown errors */ @@ -506,7 +503,6 @@ xfs_sync_worker( error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - error = xfs_qm_sync(mp, SYNC_TRYLOCK); /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); @@ -711,14 +707,13 @@ xfs_reclaim_inode_grab( return 1; /* - * do some unlocked checks first to avoid unnecessary lock traffic. - * The first is a flush lock check, the second is a already in reclaim - * check. Only do these checks if we are not going to block on locks. + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. */ if ((flags & SYNC_TRYLOCK) && - (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) return 1; - } /* * The radix tree lock here protects a thread in xfs_iget from racing diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 49403579887..bb134a81993 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -733,18 +733,15 @@ DEFINE_EVENT(xfs_dquot_class, name, \ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); -DEFINE_DQUOT_EVENT(xfs_dqinit); -DEFINE_DQUOT_EVENT(xfs_dqreuse); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); -DEFINE_DQUOT_EVENT(xfs_dqlookup_want); -DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -893,7 +890,6 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fsize_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, flags) @@ -902,17 +898,15 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " "offset 0x%llx count 0x%zx ioflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) @@ -980,7 +974,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) __field(int, type) @@ -992,7 +985,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; __entry->type = type; @@ -1000,13 +992,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " - "offset 0x%llx count %zd type %s " - "startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->new_size, __entry->offset, __entry->count, __print_symbolic(__entry->type, XFS_IO_TYPES), @@ -1033,26 +1023,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(xfs_ino_t, ino) __field(loff_t, isize) __field(loff_t, disize) - __field(loff_t, new_size) __field(loff_t, offset) __field(size_t, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->isize = ip->i_size; + __entry->isize = VFS_I(ip)->i_size; __entry->disize = ip->i_d.di_size; - __entry->new_size = ip->i_new_size; __entry->offset = offset; __entry->count = count; ), - TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " "offset 0x%llx count %zd", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, __entry->disize, - __entry->new_size, __entry->offset, __entry->count) ); @@ -1092,8 +1079,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ TP_ARGS(ip, new_size)) -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); -DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); TRACE_EVENT(xfs_pagecache_inval, TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), @@ -1570,7 +1557,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __field(xfs_ino_t, ino) __field(int, format) __field(int, nex) - __field(int, max_nex) __field(int, broot_size) __field(int, fork_off) ), @@ -1580,18 +1566,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->ino = ip->i_ino; __entry->format = ip->i_d.di_format; __entry->nex = ip->i_d.di_nextents; - __entry->max_nex = ip->i_df.if_ext_max; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "Max in-fork extents %d, broot size %d, fork offset %d", + "broot size %d, fork offset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), __entry->nex, - __entry->max_nex, __entry->broot_size, __entry->fork_off) ) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1f35b2feca9..7adcdf15ae0 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1151,14 +1151,13 @@ xfs_trans_add_item( { struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp = tp->t_mountp); - ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); lidp->lid_item = lip; lidp->lid_flags = 0; - lidp->lid_size = 0; list_add_tail(&lidp->lid_trans, &tp->t_items); lip->li_desc = lidp; @@ -1210,219 +1209,6 @@ xfs_trans_free_items( } } -/* - * Unlock the items associated with a transaction. - * - * Items which were not logged should be freed. Those which were logged must - * still be tracked so they can be unpinned when the transaction commits. - */ -STATIC void -xfs_trans_unlock_items( - struct xfs_trans *tp, - xfs_lsn_t commit_lsn) -{ - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; - - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - xfs_trans_free_item_desc(lidp); - } -} - -/* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. - */ -static uint -xfs_trans_count_vecs( - struct xfs_trans *tp) -{ - int nvecs; - struct xfs_log_item_desc *lidp; - - nvecs = 1; - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return 0; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - } - - return nvecs; -} - -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -static void -xfs_trans_fill_vecs( - struct xfs_trans *tp, - struct xfs_log_iovec *log_vector) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_iovec *vecp; - uint nitems; - - /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. - */ - vecp = log_vector + 1; - - nitems = 0; - ASSERT(!list_empty(&tp->t_items)); - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* - * The item may be marked dirty but not log anything. This can - * be used to get called when a transaction is committed. - */ - if (lidp->lid_size) - nitems++; - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; - IOP_PIN(lidp->lid_item); - } - - /* - * Now that we've counted the number of items in this transaction, fill - * in the transaction header. Note that the transaction header does not - * have a log item. - */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} - -/* - * The committed item processing consists of calling the committed routine of - * each logged item, updating the item's position in the AIL if necessary, and - * unpinning each item. If the committed routine returns -1, then do nothing - * further with the item because it may have been freed. - * - * Since items are unlocked when they are copied to the incore log, it is - * possible for two transactions to be completing and manipulating the same - * item simultaneously. The AIL lock will protect the lsn field of each item. - * The value of this field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because otherwise - * they could be immediately flushed and we'd have to race with the flusher - * trying to pull the item from the AIL as we add it. - */ -static void -xfs_trans_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn, - int aborted) -{ - xfs_lsn_t item_lsn; - struct xfs_ail *ailp; - - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - item_lsn = IOP_COMMITTED(lip, commit_lsn); - - /* item_lsn of -1 means the item needs no further processing */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) - return; - - /* - * If the returned lsn is greater than what it contained before, update - * the location of the item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it is possible that a - * later transaction completing simultaneously with an earlier one - * using the same item could complete first with a higher lsn. This - * would cause the earlier transaction to fail the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn and update the - * position of the item in the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, unpin it so it can - * be flushed. Pass information about buffer stale state down from the - * log item flags, if anyone else stales the buffer we do not want to - * pay any attention to it. - */ - IOP_UNPIN(lip, 0); -} - -/* - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - */ -STATIC void -xfs_trans_committed( - void *arg, - int abortflag) -{ - struct xfs_trans *tp = arg; - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); - xfs_trans_free_item_desc(lidp); - } - - xfs_trans_free(tp); -} - static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, @@ -1538,258 +1324,6 @@ xfs_trans_committed_bulk( } /* - * Called from the trans_commit code when we notice that the filesystem is in - * the middle of a forced shutdown. - * - * When we are called here, we have already pinned all the items in the - * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called - * so we can simply walk the items in the transaction, unpin them with an abort - * flag and then free the items. Note that unpinning the items can result in - * them being freed immediately, so we need to use a safe list traversal method - * here. - */ -STATIC void -xfs_trans_uncommit( - struct xfs_trans *tp, - uint flags) -{ - struct xfs_log_item_desc *lidp, *n; - - list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) { - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN(lidp->lid_item, 1); - } - - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); - - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); - xfs_trans_free(tp); -} - -/* - * Format the transaction direct to the iclog. This isolates the physical - * transaction commit operation from the logical operation and hence allows - * other methods to be introduced without affecting the existing commit path. - */ -static int -xfs_trans_commit_iclog( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - int shutdown; - int error; - int log_flags = 0; - struct xlog_in_core *commit_iclog; -#define XFS_TRANS_LOGVEC_COUNT 16 - struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xfs_log_iovec *log_vector; - uint nvec; - - - /* - * Ask each log item how many log_vector entries it will - * need so we can figure out how many to allocate. - * Try to avoid the kmem_alloc() call in the common case - * by using a vector from the stack when it fits. - */ - nvec = xfs_trans_count_vecs(tp); - if (nvec == 0) { - return ENOMEM; /* triggers a shutdown! */ - } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { - log_vector = log_vector_fast; - } else { - log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * - sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - - /* - * Fill in the log_vector and pin the logged items, and - * then write the transaction to the log. - */ - xfs_trans_fill_vecs(tp, log_vector); - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); - - /* - * The transaction is committed incore here, and can go out to disk - * at any time after this call. However, all the items associated - * with the transaction are still locked and pinned in memory. - */ - *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - - tp->t_commit_lsn = *commit_lsn; - trace_xfs_trans_commit_lsn(tp); - - if (nvec > XFS_TRANS_LOGVEC_COUNT) - kmem_free(log_vector); - - /* - * If we got a log write error. Unpin the logitems that we - * had pinned, clean up, free trans structure, and return error. - */ - if (error || *commit_lsn == -1) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); - return XFS_ERROR(EIO); - } - - /* - * Once the transaction has committed, unused - * reservations need to be released and changes to - * the superblock need to be reflected in the in-core - * version. Do that now. - */ - xfs_trans_unreserve_and_mod_sb(tp); - - /* - * Tell the LM to call the transaction completion routine - * when the log write with LSN commit_lsn completes (e.g. - * when the transaction commit really hits the on-disk log). - * After this call we cannot reference tp, because the call - * can happen at any time and the call will free the transaction - * structure pointed to by tp. The only case where we call - * the completion routine (xfs_trans_committed) directly is - * if the log is turned off on a debug kernel or we're - * running in simulation mode (the log is explicitly turned - * off). - */ - tp->t_logcb.cb_func = xfs_trans_committed; - tp->t_logcb.cb_arg = tp; - - /* - * We need to pass the iclog buffer which was used for the - * transaction commit record into this function, and attach - * the callback to it. The callback must be attached before - * the items are unlocked to avoid racing with other threads - * waiting for an item to unlock. - */ - shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); - - /* - * Mark this thread as no longer being in a transaction - */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* - * Once all the items of the transaction have been copied - * to the in core log and the callback is attached, the - * items can be unlocked. - * - * This will free descriptors pointing to items which were - * not logged since there is nothing more to do with them. - * For items which were logged, we will keep pointers to them - * so they can be unpinned after the transaction commits to disk. - * This will also stamp each modified meta-data item with - * the commit lsn of this transaction for dependency tracking - * purposes. - */ - xfs_trans_unlock_items(tp, *commit_lsn); - - /* - * If we detected a log error earlier, finish committing - * the transaction now (unpin log items, etc). - * - * Order is critical here, to avoid using the transaction - * pointer after its been freed (by xfs_trans_committed - * either here now, or as a callback). We cannot do this - * step inside xfs_log_notify as was done earlier because - * of this issue. - */ - if (shutdown) - xfs_trans_committed(tp, XFS_LI_ABORTED); - - /* - * Now that the xfs_trans_committed callback has been attached, - * and the items are released we can finally allow the iclog to - * go to disk. - */ - return xfs_log_release_iclog(mp, commit_iclog); -} - -/* - * Walk the log items and allocate log vector structures for - * each item large enough to fit all the vectors they require. - * Note that this format differs from the old log vector format in - * that there is no transaction header in these log vectors. - */ -STATIC struct xfs_log_vec * -xfs_trans_alloc_log_vecs( - xfs_trans_t *tp) -{ - struct xfs_log_item_desc *lidp; - struct xfs_log_vec *lv = NULL; - struct xfs_log_vec *ret_lv = NULL; - - - /* Bail out if we didn't find a log item. */ - if (list_empty(&tp->t_items)) { - ASSERT(0); - return NULL; - } - - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_vec *new_lv; - - /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) - continue; - - /* Skip items that do not have any vectors for writing */ - lidp->lid_size = IOP_SIZE(lidp->lid_item); - if (!lidp->lid_size) - continue; - - new_lv = kmem_zalloc(sizeof(*new_lv) + - lidp->lid_size * sizeof(struct xfs_log_iovec), - KM_SLEEP); - - /* The allocated iovec region lies beyond the log vector. */ - new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = lidp->lid_size; - new_lv->lv_item = lidp->lid_item; - if (!ret_lv) - ret_lv = new_lv; - else - lv->lv_next = new_lv; - lv = new_lv; - } - - return ret_lv; -} - -static int -xfs_trans_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct xfs_log_vec *log_vector; - - /* - * Get each log item to allocate a vector structure for - * the log item to to pass to the log write code. The - * CIL commit code will format the vector and save it away. - */ - log_vector = xfs_trans_alloc_log_vecs(tp); - if (!log_vector) - return ENOMEM; - - xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); - - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free(tp); - return 0; -} - -/* * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical @@ -1845,17 +1379,16 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - if (mp->m_flags & XFS_MOUNT_DELAYLOG) - error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); - else - error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); - + error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); if (error == ENOMEM) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); error = XFS_ERROR(EIO); goto out_unreserve; } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free(tp); + /* * If the transaction needs to be synchronous, then force the * log out now and wait for it. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 3ae713c0abd..f6118703f20 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -163,9 +163,8 @@ typedef struct xfs_trans_header { */ struct xfs_log_item_desc { struct xfs_log_item *lid_item; - ushort lid_size; - unsigned char lid_flags; struct list_head lid_trans; + unsigned char lid_flags; }; #define XFS_LID_DIRTY 0x1 diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee67792..c4ba366d24e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -649,12 +649,12 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - hardlimit <= nblks + *resbcountp) { + hardlimit < nblks + *resbcountp) { xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } if (softlimit > 0ULL && - softlimit <= nblks + *resbcountp) { + softlimit < nblks + *resbcountp) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, @@ -677,11 +677,13 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) { + if (hardlimit > 0ULL && + hardlimit < ninos + count) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; } - if (softlimit > 0ULL && count >= softlimit) { + if (softlimit > 0ULL && + softlimit < ninos + count) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 8b32d1a4c5a..89dbb4a5087 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -53,7 +53,7 @@ xfs_dir_ialloc( output: may be a new transaction. */ xfs_inode_t *dp, /* directory within whose allocate the inode. */ - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, prid_t prid, /* project id */ diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index 456fca31493..5eeab4690cf 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,7 +18,7 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, +extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, xfs_inode_t **, int *); extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ce9268a2f56..ebdb88840a4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -131,7 +131,8 @@ xfs_readlink( __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - return XFS_ERROR(EFSCORRUPTED); + error = XFS_ERROR(EFSCORRUPTED); + goto out; } @@ -175,7 +176,7 @@ xfs_free_eofblocks( * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); if (last_fsb <= end_fsb) return 0; @@ -226,7 +227,14 @@ xfs_free_eofblocks( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, ip->i_size); + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); if (error) { /* * If we get an error at this point we simply don't @@ -540,8 +548,8 @@ xfs_release( return 0; if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { @@ -618,7 +626,7 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_size != 0) || + ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && S_ISREG(ip->i_d.di_mode)); @@ -632,12 +640,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((S_ISREG(ip->i_d.di_mode) && - ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & + (VFS_I(ip)->i_size > 0 || + (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0)))) { + ip->i_delayed_blks != 0))) { error = xfs_free_eofblocks(mp, ip, 0); if (error) return VN_INACTIVE_CACHE; @@ -670,13 +678,18 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_itruncate_data(&tp, ip, 0); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } + + ASSERT(ip->i_d.di_nextents == 0); } else if (S_ISLNK(ip->i_d.di_mode)) { /* @@ -822,7 +835,7 @@ int xfs_create( xfs_inode_t *dp, struct xfs_name *name, - mode_t mode, + umode_t mode, xfs_dev_t rdev, xfs_inode_t **ipp) { @@ -1481,7 +1494,7 @@ xfs_symlink( xfs_inode_t *dp, struct xfs_name *link_name, const char *target_path, - mode_t mode, + umode_t mode, xfs_inode_t **ipp) { xfs_mount_t *mp = dp->i_mount; @@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes( * since nothing can read beyond eof. The space will * be zeroed when the file is extended anyway. */ - if (startoff >= ip->i_size) + if (startoff >= XFS_ISIZE(ip)) return 0; - if (endoff > ip->i_size) - endoff = ip->i_size; + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -2260,7 +2273,7 @@ xfs_change_file_space( bf->l_start += offset; break; case 2: /*SEEK_END*/ - bf->l_start += ip->i_size; + bf->l_start += XFS_ISIZE(ip); break; default: return XFS_ERROR(EINVAL); @@ -2277,7 +2290,7 @@ xfs_change_file_space( bf->l_whence = 0; startoffset = bf->l_start; - fsize = ip->i_size; + fsize = XFS_ISIZE(ip); /* * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 35d3d513e1e..0c877cbde14 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -26,7 +26,7 @@ int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); @@ -35,7 +35,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, mode_t mode, struct xfs_inode **ipp); + const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); |